/*
 * Copyright (c) 2016-2022 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#include "src/cpu/kernels/CpuMulKernel.h"

#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/TensorInfo.h"
#include "src/core/CPP/Validate.h"
#include "src/core/NEON/NEAsymm.h"
#include "src/core/NEON/NESymm.h"
#include "src/core/NEON/wrapper/wrapper.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"

#include <arm_neon.h>

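// Minimum workload sizes (in elements) used to decide how finely the kernel is
// split across threads. The N1/V1 values appear to be tuned for Neoverse N1 and
// Neoverse V1; that attribution is an assumption based on the constant names.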
namespace
{
#if defined(ENABLE_FP32_KERNELS)
static constexpr size_t default_mws_N1_fp32_neon = 22447;
static constexpr size_t default_mws_V1_fp32_neon = 38982;
#endif /* ENABLE_FP32_KERNELS */
static constexpr size_t default_mws_other_platforms_1d_tensor = 10240;
}
namespace arm_compute
{
namespace cpu
{
namespace kernels
{
namespace
{
const float       scale255_constant      = 1.f / 255.f;
const float32x4_t scale255_constant_f32q = vdupq_n_f32(scale255_constant);
const float32x4_t positive_round_f32q    = vdupq_n_f32(0.5f);

inline Status validate_arguments(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy)
{
    ARM_COMPUTE_UNUSED(overflow_policy);
    ARM_COMPUTE_UNUSED(rounding_policy);

    ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src1);
    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src1, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::S16, DataType::S32, DataType::QSYMM16, DataType::F16,
                                                         DataType::F32);
    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src2, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::S16, DataType::S32, DataType::QSYMM16, DataType::F16,
                                                         DataType::F32);
    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
                                                         DataType::S16, DataType::QSYMM16,
                                                         DataType::S32, DataType::F16, DataType::F32);
    if(is_data_type_quantized(src1->data_type()) || is_data_type_quantized(src2->data_type()))
    {
        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src1, src2);
        ARM_COMPUTE_RETURN_ERROR_ON_MSG(overflow_policy == ConvertPolicy::WRAP, "ConvertPolicy cannot be WRAP if datatype is quantized");
    }

    if(dst->total_size() > 0)
    {
        const TensorShape &out_shape = TensorShape::broadcast_shape(src1->tensor_shape(), src2->tensor_shape());
        ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, dst->tensor_shape(), 0), "Wrong shape for dst");
        ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible");
        // clang-format off
        ARM_COMPUTE_RETURN_ERROR_ON_MSG(
            !(src1->data_type() == src2->data_type() && src2->data_type() == dst->data_type()) &&
            !(src1->data_type() == DataType::U8 && src2->data_type() == DataType::U8 && dst->data_type() == DataType::S16) &&
            !(src1->data_type() == DataType::U8 && src2->data_type() == DataType::S16 && dst->data_type() == DataType::S16) &&
            !(src1->data_type() == DataType::S16 && src2->data_type() == DataType::U8 && dst->data_type() == DataType::S16) &&
            !(src1->data_type() == DataType::QSYMM16 && src2->data_type() == DataType::QSYMM16 && dst->data_type() == DataType::S32)
            , "Invalid data type combination");
        // clang-format on
        ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->data_type() == DataType::QSYMM16 && dst->data_type() == DataType::S32 && scale != 1.f, "Unsupported scale for QSYMM16 inputs and S32 dst");
    }

    if(std::abs(scale - scale255_constant) < 0.00001f)
    {
        ARM_COMPUTE_RETURN_ERROR_ON(rounding_policy != RoundingPolicy::TO_NEAREST_UP && rounding_policy != RoundingPolicy::TO_NEAREST_EVEN);
        ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->data_type() == DataType::S32 && src2->data_type() == DataType::S32 && dst->data_type() == DataType::S32,
                                        "Scale == 1/255 is not supported if input and dst are of data type S32");
    }
    else
    {
        ARM_COMPUTE_RETURN_ERROR_ON(rounding_policy != RoundingPolicy::TO_ZERO);

        int         exponent            = 0;
        const float normalized_mantissa = std::frexp(scale, &exponent);

        // Use int scaling if the factor is equal to 1/2^n for 0 <= n <= 15
        // frexp returns 0.5 as the mantissa, which means that the exponent will be in the range -14 <= e <= 1
        // (e = 1 - n), and non-positive whenever n >= 1 since we deal with 1/2^n
        ARM_COMPUTE_RETURN_ERROR_ON_MSG(!((normalized_mantissa == 0.5f) && (-14 <= exponent) && (exponent <= 1)), "Scale value not supported (should be 1/(2^n) or 1/255)");
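        // e.g. scale = 1/8: frexp(0.125) yields mantissa 0.5 and exponent -2, which is accepted;
        // scale = 0.3 yields mantissa 0.6 and is rejected.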
    }

    return Status{};
}

/* Scales a given vector by 1/255.
 *
 * @note This does not work for all cases, e.g. for a float input of 0.49999999999999994 or for large floats.
 *
 * @param in Input vector to scale.
 * @return   Scaled dst rounded to nearest (round half up).
 */
inline int32x4_t scale255_S32_S32(int32x4_t in)
{
    // Scale
    const float32x4_t tmp = vmulq_f32(vcvtq_f32_s32(in), scale255_constant_f32q);
    // Round to nearest (round half up)
    // Add +0.5 for all values
    // Afterwards vcvt rounds toward zero
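    // e.g. in = 65025 (= 255 * 255): 65025 * (1/255) = 255.0, +0.5 -> 255.5, truncated to 255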
    return vcvtq_s32_f32(vaddq_f32(tmp, positive_round_f32q));
}

inline uint16x8_t scale255_U16_U16(uint16x8_t in)
{
    const int32x4_t tmp_s1 = scale255_S32_S32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(in))));
    const int32x4_t tmp_s2 = scale255_S32_S32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(in))));
    return vreinterpretq_u16_s16(vcombine_s16(vmovn_s32(tmp_s2), vmovn_s32(tmp_s1)));
}

template <typename T>
inline typename std::enable_if<std::is_same<T, int8_t>::value, int8x16_t>::type
vquantize(float32x4x4_t val, const UniformQuantizationInfo &info)
{
    return vquantize_signed(val, info);
}

template <typename T>
inline typename std::enable_if<std::is_same<T, uint8_t>::value, uint8x16_t>::type
vquantize(float32x4x4_t val, const UniformQuantizationInfo &info)
{
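    // Not a recursive call: T cannot be deduced from the arguments, so this
    // resolves to the non-template vquantize() overload from NEAsymm.h.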
    return vquantize(val, info);
}

template <typename T>
void mul_saturate_quantized_8(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, float scale)
{
    // Create input windows
    Window win        = window;
    Window input1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
    Window input2_win = window.broadcast_if_dimension_le_one(src2->info()->tensor_shape());

    // Clear X Dimension on execution window as we handle manually
    win.set(Window::DimX, Window::Dimension(0, 1, 1));

    const int  window_step_x         = 16 / sizeof(T);
    const auto window_start_x        = static_cast<int>(window.x().start());
    const auto window_end_x          = static_cast<int>(window.x().end());
    const bool is_broadcast_across_x = src1->info()->tensor_shape().x() != src2->info()->tensor_shape().x();

    const UniformQuantizationInfo output_qua_info = out->info()->quantization_info().uniform();
    const UniformQuantizationInfo tmp_qua_info    = { output_qua_info.scale / scale, output_qua_info.offset };
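    // The kernel scale is folded into the output quantization info: quantizing
    // with scale_out / scale computes round((in1 * in2 * scale) / scale_out) + offset,
    // so the loops below never multiply by `scale` explicitly.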

    if(is_broadcast_across_x)
    {
        const bool                    is_broadcast_input_2 = input2_win.x().step() == 0;
        Window                        broadcast_win        = is_broadcast_input_2 ? input2_win : input1_win;
        Window                        non_broadcast_win    = !is_broadcast_input_2 ? input2_win : input1_win;
        const ITensor                *broadcast_tensor     = is_broadcast_input_2 ? src2 : src1;
        const ITensor                *non_broadcast_tensor = !is_broadcast_input_2 ? src2 : src1;
        const UniformQuantizationInfo broadcast_qinfo      = broadcast_tensor->info()->quantization_info().uniform();
        const UniformQuantizationInfo non_broadcast_qinfo  = non_broadcast_tensor->info()->quantization_info().uniform();

        // Clear X Dimension on execution window as we handle manually
        non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));

        Iterator broadcast_input(broadcast_tensor, broadcast_win);
        Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
        Iterator dst(out, win);

        using ExactTagType = typename wrapper::traits::neon_vector<T, window_step_x>::tag_type;

        execute_window_loop(
            win, [&](const Coordinates &)
        {
            const auto non_broadcast_input_ptr = reinterpret_cast<const T *>(non_broadcast_input.ptr());
            const auto output_ptr              = reinterpret_cast<T *>(dst.ptr());

            const auto broadcast_value     = *reinterpret_cast<const T *>(broadcast_input.ptr());
            const auto broadcast_value_vec = wrapper::vdup_n(broadcast_value, ExactTagType{});

            // Compute window_step_x elements per iteration
            int x = window_start_x;
            for(; x <= (window_end_x - window_step_x); x += window_step_x)
            {
                const auto non_broadcast_v = wrapper::vloadq(non_broadcast_input_ptr + x);

                // Dequantize inputs
                const float32x4x4_t in1_f32x4x4 = vdequantize(non_broadcast_v, non_broadcast_qinfo);
                const float32x4x4_t in2_f32x4x4 = vdequantize(broadcast_value_vec, broadcast_qinfo);

                const float32x4x4_t out_f32x4x4 =
                {
                    vmulq_f32(in1_f32x4x4.val[0], in2_f32x4x4.val[0]),
                    vmulq_f32(in1_f32x4x4.val[1], in2_f32x4x4.val[1]),
                    vmulq_f32(in1_f32x4x4.val[2], in2_f32x4x4.val[2]),
                    vmulq_f32(in1_f32x4x4.val[3], in2_f32x4x4.val[3]),
                };

                // Quantize dst
                const auto result = vquantize<T>(out_f32x4x4, tmp_qua_info);
                wrapper::vstore(output_ptr + x, result);
            }

            // Compute left-over elements
            for(; x < window_end_x; ++x)
            {
                // Dequantize inputs
                const T     src1    = *(non_broadcast_input_ptr + x);
                const float tmp_in1 = Qasymm8QuantizationHelper<T>::dequantize(src1, non_broadcast_qinfo);
                const float tmp_in2 = Qasymm8QuantizationHelper<T>::dequantize(broadcast_value, broadcast_qinfo);
                const float tmp_f   = tmp_in1 * tmp_in2;

                // Quantize dst
                const auto tmp_qua = Qasymm8QuantizationHelper<T>::quantize(tmp_f, tmp_qua_info);
                *(output_ptr + x)  = tmp_qua;
            }
        },
        broadcast_input, non_broadcast_input, dst);
    }
240 else
241 {
Sheri Zhang1e3ab422021-03-16 17:35:08 +0000242 const UniformQuantizationInfo input1_qua_info = src1->info()->quantization_info().uniform();
243 const UniformQuantizationInfo input2_qua_info = src2->info()->quantization_info().uniform();
Sheri Zhanga449a362020-07-16 15:52:25 +0100244
245 // Clear X Dimension on execution window as we handle manually
246 input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
247 input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
248
Sheri Zhang1e3ab422021-03-16 17:35:08 +0000249 Iterator input1(src1, input1_win);
250 Iterator input2(src2, input2_win);
251 Iterator dst(out, win);
Sheri Zhanga449a362020-07-16 15:52:25 +0100252
Omar Al Khatib605a9282022-11-01 17:01:24 +0000253 execute_window_loop(
254 win, [&](const Coordinates &)
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +0100255 {
Sheri Zhanga449a362020-07-16 15:52:25 +0100256 const auto input1_ptr = reinterpret_cast<const T *>(input1.ptr());
257 const auto input2_ptr = reinterpret_cast<const T *>(input2.ptr());
Sheri Zhang1e3ab422021-03-16 17:35:08 +0000258 const auto output_ptr = reinterpret_cast<T *>(dst.ptr());
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +0100259
Sheri Zhanga449a362020-07-16 15:52:25 +0100260 // Compute window_step_x elements per iteration
261 int x = window_start_x;
262 for(; x <= (window_end_x - window_step_x); x += window_step_x)
263 {
264 const auto input1_q = wrapper::vloadq(input1_ptr + x);
265 const auto input2_q = wrapper::vloadq(input2_ptr + x);
266
267 // Dequantize inputs
268 const float32x4x4_t in1_f32x4x4 = vdequantize(input1_q, input1_qua_info);
269 const float32x4x4_t in2_f32x4x4 = vdequantize(input2_q, input2_qua_info);
270
271 const float32x4x4_t out_f32x4x4 =
272 {
273 vmulq_f32(in1_f32x4x4.val[0], in2_f32x4x4.val[0]),
274 vmulq_f32(in1_f32x4x4.val[1], in2_f32x4x4.val[1]),
275 vmulq_f32(in1_f32x4x4.val[2], in2_f32x4x4.val[2]),
276 vmulq_f32(in1_f32x4x4.val[3], in2_f32x4x4.val[3]),
277 };
278
Sheri Zhang1e3ab422021-03-16 17:35:08 +0000279 // Quantize dst
Sheri Zhanga449a362020-07-16 15:52:25 +0100280 const auto result = vquantize<T>(out_f32x4x4, tmp_qua_info);
281 wrapper::vstore(output_ptr + x, result);
282 }
283
284 // Compute left-over elements
285 for(; x < window_end_x; ++x)
286 {
287 // Dequantize inputs
Sheri Zhang1e3ab422021-03-16 17:35:08 +0000288 const T src1 = *(input1_ptr + x);
289 const T src2 = *(input2_ptr + x);
290 const float tmp_in1 = Qasymm8QuantizationHelper<T>::dequantize(src1, input1_qua_info);
291 const float tmp_in2 = Qasymm8QuantizationHelper<T>::dequantize(src2, input2_qua_info);
Michele Di Giorgio40aad9b2020-07-22 15:17:43 +0100292 const float tmp_f = tmp_in1 * tmp_in2;
Sheri Zhanga449a362020-07-16 15:52:25 +0100293
Sheri Zhang1e3ab422021-03-16 17:35:08 +0000294 // Quantize dst
Michele Di Giorgio40aad9b2020-07-22 15:17:43 +0100295 const auto tmp_qua = Qasymm8QuantizationHelper<T>::quantize(tmp_f, tmp_qua_info);
Sheri Zhanga449a362020-07-16 15:52:25 +0100296 *(output_ptr + x) = tmp_qua;
297 }
298 },
Sheri Zhang1e3ab422021-03-16 17:35:08 +0000299 input1, input2, dst);
Sheri Zhanga449a362020-07-16 15:52:25 +0100300 }
Michele Di Giorgio9428a182020-03-30 14:10:20 +0100301}

bool mul_q8_neon_fixedpoint_possible(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, float scale)
{
    const auto iq0 = src0->quantization_info().uniform();
    const auto iq1 = src1->quantization_info().uniform();
    const auto oq  = dst->quantization_info().uniform();

    const auto multiplier = ((iq0.scale * iq1.scale) / oq.scale) * scale;
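    // The multiplier must fit a Q14.18 signed fixed-point number (14 integer
    // bits including sign, 18 fractional bits), whose range is roughly
    // [-8192, 8192); hence the +/-8191 bounds below.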

    if(multiplier < -8191.f || multiplier > 8191.f)
    {
        // The multiplier cannot be stored as a 14.18 signed fixed-point number
        return false;
    }

    const auto offset_out = float(oq.offset);

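    // 256 * 256 is a conservative bound on the worst-case product of the two
    // offset-corrected 8-bit inputs.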
    const auto max_result = multiplier * (256) * (256) + offset_out;

    if(max_result > 8191.f)
    {
        // It might not be possible to store the result as a 14.18 signed fixed-point number.
        return false;
    }

    return true;
}

template <typename ScalarType>
void mul_q8_neon_fixedpoint(const ITensor *src0, const ITensor *src1, ITensor *dst, const Window &window, float scale)
{
    const auto in0_info = src0->info();
    const auto in1_info = src1->info();

    const auto &in0_shape = in0_info->tensor_shape();
    const auto &in1_shape = in1_info->tensor_shape();

    // Create input windows.
    Window in0_win = window.broadcast_if_dimension_le_one(in0_shape);
    Window in1_win = window.broadcast_if_dimension_le_one(in1_shape);

    // Clear the x dimension on the execution window as we process the whole row each iteration.
    Window win = window;
    win.set(Window::DimX, Window::Dimension(0, 1, 1));

    constexpr int window_step_x         = 16;
    const auto    window_start_x        = window.x().start();
    const auto    window_end_x          = window.x().end();
    const auto    is_broadcast_across_x = in0_shape.x() != in1_shape.x();

    const auto iq0_info = in0_info->quantization_info().uniform();
    const auto iq1_info = in1_info->quantization_info().uniform();
    const auto oq_info  = dst->info()->quantization_info().uniform();

    const auto in0_offset = iq0_info.offset;
    const auto in1_offset = iq1_info.offset;
    const auto out_offset = oq_info.offset;
    const auto multiplier = ((iq0_info.scale * iq1_info.scale) / oq_info.scale) * scale;

    constexpr int32_t two_pwr18i = 262144;
    constexpr float   two_pwr18f = 262144.f;

    const auto in0_offset_16p0  = static_cast<int16_t>(in0_offset);
    const auto in1_offset_16p0  = static_cast<int16_t>(in1_offset);
    const auto out_offset_14p18 = static_cast<int32_t>(out_offset * two_pwr18i);
    const auto multiplier_14p18 = static_cast<int32_t>(multiplier * two_pwr18f);
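    // Multiplying by 2^18 = 262144 converts the offset and multiplier to Q14.18
    // fixed point; the accumulation below happens in Q14.18 and the 18
    // fractional bits are shifted away again when narrowing back to 8 bits.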

    if(is_broadcast_across_x)
    {
        // Prefix: a = non-broadcast, b = broadcast.

        const auto is_broadcast_input_1 = in1_win.x().step() == 0;
        auto       a_win                = is_broadcast_input_1 ? in0_win : in1_win;
        auto       b_win                = is_broadcast_input_1 ? in1_win : in0_win;
        const auto a_tensor             = is_broadcast_input_1 ? src0 : src1;
        const auto b_tensor             = is_broadcast_input_1 ? src1 : src0;

        const auto a_offset_16p0 = is_broadcast_input_1 ? in0_offset_16p0 : in1_offset_16p0;
        const auto b_offset_16p0 = is_broadcast_input_1 ? in1_offset : in0_offset;
#ifndef __aarch64__
        const auto a_offset = is_broadcast_input_1 ? in0_offset : in1_offset;
        const auto b_offset = is_broadcast_input_1 ? in1_offset : in0_offset;
#endif //__aarch64__
        const auto a_voffset_16p0 = wrapper::vdup_n(a_offset_16p0, wrapper::traits::vector_64_tag());

        // Clear the x dimension on the execution window as we process the whole row each iteration.
        a_win.set(Window::DimX, Window::Dimension(0, 1, 1));

        Iterator a_input_it(a_tensor, a_win);
        Iterator b_input_it(b_tensor, b_win);
        Iterator out_it(dst, win);

        execute_window_loop(
            win, [&](const Coordinates &)
        {
            const auto a_ptr   = reinterpret_cast<const ScalarType *>(a_input_it.ptr());
            const auto b_ptr   = reinterpret_cast<const ScalarType *>(b_input_it.ptr());
            const auto out_ptr = reinterpret_cast<ScalarType *>(out_it.ptr());

            const auto b_val            = *b_ptr;
            const auto b_offseted_32p0  = static_cast<int32_t>(b_val - b_offset_16p0);
            const auto b_voffseted_32p0 = wrapper::vdup_n(b_offseted_32p0, wrapper::traits::vector_128_tag());

            const auto vmultiplier_14p18 = wrapper::vdup_n(multiplier_14p18, wrapper::traits::vector_128_tag());
            const auto voffsetout_14p18  = wrapper::vdup_n(out_offset_14p18, wrapper::traits::vector_128_tag());

            int x = window_start_x;

            for(; x <= (window_end_x - window_step_x); x += window_step_x)
            {
                // Load the inputs.
                const auto a_vin_8p0 = wrapper::vloadq(a_ptr + x);

                // Widen the non-broadcast elements to signed 16-bit regardless of the input signedness.
                const auto a_vin_16p0_0 = wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(a_vin_8p0)));
                const auto a_vin_16p0_1 = wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(a_vin_8p0)));

                const auto voffseted_32p0_00 = wrapper::vsubl(wrapper::vgetlow(a_vin_16p0_0), a_voffset_16p0);
                const auto voffseted_32p0_01 = wrapper::vsubl(wrapper::vgethigh(a_vin_16p0_0), a_voffset_16p0);
                const auto voffseted_32p0_10 = wrapper::vsubl(wrapper::vgetlow(a_vin_16p0_1), a_voffset_16p0);
                const auto voffseted_32p0_11 = wrapper::vsubl(wrapper::vgethigh(a_vin_16p0_1), a_voffset_16p0);

                const auto vinnermul_32p0_00 = wrapper::vmul(voffseted_32p0_00, b_voffseted_32p0);
                const auto vinnermul_32p0_01 = wrapper::vmul(voffseted_32p0_01, b_voffseted_32p0);
                const auto vinnermul_32p0_10 = wrapper::vmul(voffseted_32p0_10, b_voffseted_32p0);
                const auto vinnermul_32p0_11 = wrapper::vmul(voffseted_32p0_11, b_voffseted_32p0);

                const auto vout_14p18_00 = wrapper::vmla(voffsetout_14p18, vinnermul_32p0_00, vmultiplier_14p18);
                const auto vout_14p18_01 = wrapper::vmla(voffsetout_14p18, vinnermul_32p0_01, vmultiplier_14p18);
                const auto vout_14p18_10 = wrapper::vmla(voffsetout_14p18, vinnermul_32p0_10, vmultiplier_14p18);
                const auto vout_14p18_11 = wrapper::vmla(voffsetout_14p18, vinnermul_32p0_11, vmultiplier_14p18);

                // These right shifts revert the multiplication by 2^18. A hard limit of a maximum shift of 8 requires multiple shift instructions to achieve this.
                const auto vout_15p1_00 = wrapper::vqrshrn_ex<8, ScalarType>(wrapper::vshrq_n<8>(vout_14p18_00));
                const auto vout_15p1_01 = wrapper::vqrshrn_ex<8, ScalarType>(wrapper::vshrq_n<8>(vout_14p18_01));
                const auto vout_15p1_10 = wrapper::vqrshrn_ex<8, ScalarType>(wrapper::vshrq_n<8>(vout_14p18_10));
                const auto vout_15p1_11 = wrapper::vqrshrn_ex<8, ScalarType>(wrapper::vshrq_n<8>(vout_14p18_11));

                const auto vout_15p1_0 = wrapper::vcombine(
                                             vout_15p1_00,
                                             vout_15p1_01);

                const auto vout_15p1_1 = wrapper::vcombine(
                                             vout_15p1_10,
                                             vout_15p1_11);

                const auto vout_8p0 = wrapper::vcombine(
                                          wrapper::vqrshrn<2>(vout_15p1_0),
                                          wrapper::vqrshrn<2>(vout_15p1_1));
                wrapper::vstore(out_ptr + x, vout_8p0);
            }

            // Process the left-over elements.
            for(; x < window_end_x; ++x)
            {
#ifdef __aarch64__
                out_ptr[x] = wrapper::vqrshrn<2>(wrapper::vqrshrn_ex<8, ScalarType>(wrapper::vshrq_n<8>(
                                 (multiplier_14p18 * (int32_t(a_ptr[x]) - a_offset_16p0) * (int32_t(b_val) - b_offset_16p0)) + out_offset_14p18)));
#else  //__aarch64__
                out_ptr[x] = utility::clamp<int32_t, ScalarType>(support::cpp11::lround(multiplier * ((float(a_ptr[x]) - a_offset) * (float(b_val) - b_offset)) + float(out_offset)));
#endif //__aarch64__
            }
        },
        a_input_it, b_input_it, out_it);
    }
    else
    {
        const auto voffset0_16p0     = wrapper::vdup_n(in0_offset_16p0, wrapper::traits::vector_64_tag());
        const auto voffset1_16p0     = wrapper::vdup_n(in1_offset_16p0, wrapper::traits::vector_64_tag());
        const auto voffsetout_14p18  = wrapper::vdup_n(out_offset_14p18, wrapper::traits::vector_128_tag());
        const auto vmultiplier_14p18 = wrapper::vdup_n(multiplier_14p18, wrapper::traits::vector_128_tag());

        // Clear the x dimension on the execution window as we process the whole row each iteration.
        in0_win.set(Window::DimX, Window::Dimension(0, 1, 1));
        in1_win.set(Window::DimX, Window::Dimension(0, 1, 1));

        Iterator in0_it(src0, in0_win);
        Iterator in1_it(src1, in1_win);
        Iterator out_it(dst, win);

        execute_window_loop(
            win, [&](const Coordinates &)
        {
            const auto in0_ptr = reinterpret_cast<const ScalarType *>(in0_it.ptr());
            const auto in1_ptr = reinterpret_cast<const ScalarType *>(in1_it.ptr());
            const auto out_ptr = reinterpret_cast<ScalarType *>(out_it.ptr());

            int x = window_start_x;

            for(; x <= (window_end_x - window_step_x); x += window_step_x)
            {
                // Load the inputs.
                const auto vin0_8p0 = wrapper::vloadq(in0_ptr + x);
                const auto vin1_8p0 = wrapper::vloadq(in1_ptr + x);

                // Widen the input elements to signed 16-bit regardless of the input signedness.
                const auto vin0_16p0_0 = wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(vin0_8p0)));
                const auto vin0_16p0_1 = wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(vin0_8p0)));
                const auto vin1_16p0_0 = wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(vin1_8p0)));
                const auto vin1_16p0_1 = wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(vin1_8p0)));

                const auto voffseted0_32p0_00 = wrapper::vsubl(wrapper::vgetlow(vin0_16p0_0), voffset0_16p0);
                const auto voffseted0_32p0_01 = wrapper::vsubl(wrapper::vgethigh(vin0_16p0_0), voffset0_16p0);
                const auto voffseted0_32p0_10 = wrapper::vsubl(wrapper::vgetlow(vin0_16p0_1), voffset0_16p0);
                const auto voffseted0_32p0_11 = wrapper::vsubl(wrapper::vgethigh(vin0_16p0_1), voffset0_16p0);

                const auto voffseted1_32p0_00 = wrapper::vsubl(wrapper::vgetlow(vin1_16p0_0), voffset1_16p0);
                const auto voffseted1_32p0_01 = wrapper::vsubl(wrapper::vgethigh(vin1_16p0_0), voffset1_16p0);
                const auto voffseted1_32p0_10 = wrapper::vsubl(wrapper::vgetlow(vin1_16p0_1), voffset1_16p0);
                const auto voffseted1_32p0_11 = wrapper::vsubl(wrapper::vgethigh(vin1_16p0_1), voffset1_16p0);

                const auto vinnermul_32p0_00 = wrapper::vmul(voffseted0_32p0_00, voffseted1_32p0_00);
                const auto vinnermul_32p0_01 = wrapper::vmul(voffseted0_32p0_01, voffseted1_32p0_01);
                const auto vinnermul_32p0_10 = wrapper::vmul(voffseted0_32p0_10, voffseted1_32p0_10);
                const auto vinnermul_32p0_11 = wrapper::vmul(voffseted0_32p0_11, voffseted1_32p0_11);

                const auto vout_14p18_00 = wrapper::vmla(voffsetout_14p18, vinnermul_32p0_00, vmultiplier_14p18);
                const auto vout_14p18_01 = wrapper::vmla(voffsetout_14p18, vinnermul_32p0_01, vmultiplier_14p18);
                const auto vout_14p18_10 = wrapper::vmla(voffsetout_14p18, vinnermul_32p0_10, vmultiplier_14p18);
                const auto vout_14p18_11 = wrapper::vmla(voffsetout_14p18, vinnermul_32p0_11, vmultiplier_14p18);

                // These right shifts revert the multiplication by 2^18. A hard limit of a maximum shift of 8 requires multiple shift instructions to achieve this.
                const auto vout_14p2_00 = wrapper::vqrshrn_ex<8, ScalarType>(wrapper::vshrq_n<8>(vout_14p18_00));
                const auto vout_14p2_01 = wrapper::vqrshrn_ex<8, ScalarType>(wrapper::vshrq_n<8>(vout_14p18_01));
                const auto vout_14p2_10 = wrapper::vqrshrn_ex<8, ScalarType>(wrapper::vshrq_n<8>(vout_14p18_10));
                const auto vout_14p2_11 = wrapper::vqrshrn_ex<8, ScalarType>(wrapper::vshrq_n<8>(vout_14p18_11));

                const auto vout_14p2_0 = wrapper::vcombine(
                                             vout_14p2_00,
                                             vout_14p2_01);

                const auto vout_14p2_1 = wrapper::vcombine(
                                             vout_14p2_10,
                                             vout_14p2_11);

                const auto vout_8p0 = wrapper::vcombine(
                                          wrapper::vqrshrn<2>(vout_14p2_0),
                                          wrapper::vqrshrn<2>(vout_14p2_1));
                wrapper::vstore(out_ptr + x, vout_8p0);
            }

            // Process the left-over elements.
            for(; x < window_end_x; ++x)
            {
#ifdef __aarch64__
                out_ptr[x] = wrapper::vqrshrn<2>(wrapper::vqrshrn_ex<8, ScalarType>(wrapper::vshrq_n<8>(
                                 (multiplier_14p18 * (int32_t(in0_ptr[x]) - in0_offset_16p0) * (int32_t(in1_ptr[x]) - in1_offset_16p0)) + out_offset_14p18)));
#else  //__aarch64__
                out_ptr[x] = utility::clamp<int32_t, ScalarType>(support::cpp11::lround(multiplier * ((float(in0_ptr[x]) - in0_offset) * (float(in1_ptr[x]) - in1_offset)) + float(out_offset)));
#endif //__aarch64__
            }
        },
        in0_it, in1_it, out_it);
    }
}

void mul_saturate_QSYMM16_QSYMM16_QSYMM16(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, float scale)
{
    const UniformQuantizationInfo input1_qua_info = src1->info()->quantization_info().uniform();
    const UniformQuantizationInfo input2_qua_info = src2->info()->quantization_info().uniform();
    const UniformQuantizationInfo output_qua_info = out->info()->quantization_info().uniform();

    // Create input windows
    Window win        = window;
    Window input1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
    Window input2_win = window.broadcast_if_dimension_le_one(src2->info()->tensor_shape());

    // Clear X Dimension on execution window as we handle manually
    win.set(Window::DimX, Window::Dimension(0, 1, 1));
    input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
    input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));

    Iterator input1(src1, input1_win);
    Iterator input2(src2, input2_win);
    Iterator dst(out, win);

    const int  window_step_x  = 16;
    const auto window_start_x = static_cast<int>(window.x().start());
    const auto window_end_x   = static_cast<int>(window.x().end());

    const UniformQuantizationInfo tmp_qua_info = { output_qua_info.scale / scale, output_qua_info.offset };
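    // As in mul_saturate_quantized_8, the kernel scale is folded into the
    // quantization info used when writing the result.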

    execute_window_loop(
        win, [&](const Coordinates &)
    {
        const auto input1_ptr = reinterpret_cast<const qsymm16_t *>(input1.ptr());
        const auto input2_ptr = reinterpret_cast<const qsymm16_t *>(input2.ptr());
        const auto output_ptr = reinterpret_cast<qsymm16_t *>(dst.ptr());

        // Compute window_step_x elements per iteration
        int x = window_start_x;
        for(; x <= (window_end_x - window_step_x); x += window_step_x)
        {
            const qsymm16x8x2_t input1_q =
            {
                {
                    vld1q_s16(input1_ptr + x),
                    vld1q_s16(input1_ptr + x + 8),
                }
            };
            const qsymm16x8x2_t input2_q =
            {
                {
                    vld1q_s16(input2_ptr + x),
                    vld1q_s16(input2_ptr + x + 8),
                }
            };

            // Dequantize inputs
            const float32x4x4_t in1_f32x4x4 = vdequantize(input1_q, input1_qua_info);
            const float32x4x4_t in2_f32x4x4 = vdequantize(input2_q, input2_qua_info);

            const float32x4x4_t out_f32x4x4 =
            {
                vmulq_f32(in1_f32x4x4.val[0], in2_f32x4x4.val[0]),
                vmulq_f32(in1_f32x4x4.val[1], in2_f32x4x4.val[1]),
                vmulq_f32(in1_f32x4x4.val[2], in2_f32x4x4.val[2]),
                vmulq_f32(in1_f32x4x4.val[3], in2_f32x4x4.val[3]),
            };

            const qsymm16x8x2_t result = vquantize_qsymm16(out_f32x4x4, tmp_qua_info);
            vst1q_s16(output_ptr + x, result.val[0]);
            vst1q_s16(output_ptr + x + 8, result.val[1]);
        }

        // Compute left-over elements
        for(; x < window_end_x; ++x)
        {
            // Dequantize inputs
            float tmp_in1 = static_cast<float>(*(input1_ptr + x)) * input1_qua_info.scale;
            float tmp_in2 = static_cast<float>(*(input2_ptr + x)) * input2_qua_info.scale;
            float tmp_f   = tmp_in1 * tmp_in2;

            // Quantize dst; lrintf() rounds to nearest, matching the vectorized quantization above
            int32_t   tmp     = lrintf(tmp_f / tmp_qua_info.scale);
            qsymm16_t tmp_qua = static_cast<qsymm16_t>((tmp > SHRT_MAX) ? SHRT_MAX : ((tmp < SHRT_MIN) ? SHRT_MIN : tmp));
            *(output_ptr + x) = tmp_qua;
        }
    },
    input1, input2, dst);
}

void mul_QSYMM16_QSYMM16_S32(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, int scale)
{
    ARM_COMPUTE_UNUSED(scale);
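    // No rescaling happens in this kernel: the raw S32 products of the QSYMM16
    // inputs are stored directly, so only scale == 1 is meaningful here (other
    // scales are rejected by validate_arguments()).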

    // Create input windows
    Window win        = window;
    Window input1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
    Window input2_win = window.broadcast_if_dimension_le_one(src2->info()->tensor_shape());

    // Clear X Dimension on execution window as we handle manually
    win.set(Window::DimX, Window::Dimension(0, 1, 1));
    input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
    input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));

    Iterator input1(src1, input1_win);
    Iterator input2(src2, input2_win);
    Iterator dst(out, win);

    const int  window_step_x  = 16;
    const auto window_start_x = static_cast<int>(window.x().start());
    const auto window_end_x   = static_cast<int>(window.x().end());

    execute_window_loop(
        win, [&](const Coordinates &)
    {
        const auto input1_ptr = reinterpret_cast<const qsymm16_t *>(input1.ptr());
        const auto input2_ptr = reinterpret_cast<const qsymm16_t *>(input2.ptr());
        const auto output_ptr = reinterpret_cast<int32_t *>(dst.ptr());

        // Compute window_step_x elements per iteration
        int x = window_start_x;
        for(; x <= (window_end_x - window_step_x); x += window_step_x)
        {
            const qsymm16x8x2_t input1_q =
            {
                {
                    vld1q_s16(input1_ptr + x),
                    vld1q_s16(input1_ptr + x + 8),
                }
            };
            const qsymm16x8x2_t input2_q =
            {
                {
                    vld1q_s16(input2_ptr + x),
                    vld1q_s16(input2_ptr + x + 8),
                }
            };

            const int32x4x4_t in1_s32 =
            {
                {
                    vmovl_s16(vget_low_s16(input1_q.val[0])),
                    vmovl_s16(vget_high_s16(input1_q.val[0])),
                    vmovl_s16(vget_low_s16(input1_q.val[1])),
                    vmovl_s16(vget_high_s16(input1_q.val[1])),
                }
            };
            const int32x4x4_t in2_s32 =
            {
                {
                    vmovl_s16(vget_low_s16(input2_q.val[0])),
                    vmovl_s16(vget_high_s16(input2_q.val[0])),
                    vmovl_s16(vget_low_s16(input2_q.val[1])),
                    vmovl_s16(vget_high_s16(input2_q.val[1])),
                }
            };

            const int32x4x4_t result =
            {
                {
                    vmulq_s32(in1_s32.val[0], in2_s32.val[0]),
                    vmulq_s32(in1_s32.val[1], in2_s32.val[1]),
                    vmulq_s32(in1_s32.val[2], in2_s32.val[2]),
                    vmulq_s32(in1_s32.val[3], in2_s32.val[3]),
                }
            };

            vst1q_s32(output_ptr + x, result.val[0]);
            vst1q_s32(output_ptr + x + 4, result.val[1]);
            vst1q_s32(output_ptr + x + 8, result.val[2]);
            vst1q_s32(output_ptr + x + 12, result.val[3]);
        }

        // Compute left-over elements
        for(; x < window_end_x; ++x)
        {
            int32_t tmp       = static_cast<int32_t>(*(input1_ptr + x)) * static_cast<int32_t>(*(input2_ptr + x));
            *(output_ptr + x) = tmp;
        }
    },
    input1, input2, dst);
}

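// U8 * U8 -> U8 with a fixed scale:
// - is_scale255: scale == 1/255, applied with the scale255_* helpers above;
//   otherwise scale == 1/2^n and `n` is the right-shift amount.
// - is_sat: saturate on overflow (ConvertPolicy::SATURATE) instead of wrapping.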
template <bool is_scale255, bool is_sat>
void mul_U8_U8_U8(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, int n)
{
    // Create input windows
    Window win        = window;
    Window input1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
    Window input2_win = window.broadcast_if_dimension_le_one(src2->info()->tensor_shape());

    // Clear X Dimension on execution window as we handle manually
    win.set(Window::DimX, Window::Dimension(0, 1, 1));
    input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
    input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));

    Iterator input1(src1, input1_win);
    Iterator input2(src2, input2_win);
    Iterator dst(out, win);

    const int  window_step_x  = 16 / sizeof(uint8_t);
    const auto window_start_x = static_cast<int>(window.x().start());
    const auto window_end_x   = static_cast<int>(window.x().end());

    execute_window_loop(
        win, [&](const Coordinates &)
    {
        const auto input1_ptr = reinterpret_cast<const uint8_t *>(input1.ptr());
        const auto input2_ptr = reinterpret_cast<const uint8_t *>(input2.ptr());
        const auto output_ptr = reinterpret_cast<uint8_t *>(dst.ptr());

        // Compute window_step_x elements per iteration
        int x = window_start_x;
        for(; x <= (window_end_x - window_step_x); x += window_step_x)
        {
            const uint8x16_t ta1 = wrapper::vloadq(input1_ptr + x);
            const uint8x16_t ta2 = wrapper::vloadq(input2_ptr + x);

            uint16x8_t       tmp1_high = vmovl_u8(vget_high_u8(ta1));
            const uint16x8_t tmp2_high = vmovl_u8(vget_high_u8(ta2));
            uint16x8_t       tmp1_low  = vmovl_u8(vget_low_u8(ta1));
            const uint16x8_t tmp2_low  = vmovl_u8(vget_low_u8(ta2));

            tmp1_high = vmulq_u16(tmp1_high, tmp2_high);
            tmp1_low  = vmulq_u16(tmp1_low, tmp2_low);

            if(is_scale255)
            {
                tmp1_high = scale255_U16_U16(tmp1_high);
                tmp1_low  = scale255_U16_U16(tmp1_low);
            }
            else
            {
                const int16x8_t vn = vdupq_n_s16(-n);

                if(is_sat)
                {
                    tmp1_high = vqshlq_u16(tmp1_high, vn);
                    tmp1_low  = vqshlq_u16(tmp1_low, vn);
                }
                else
                {
                    tmp1_high = vshlq_u16(tmp1_high, vn);
                    tmp1_low  = vshlq_u16(tmp1_low, vn);
                }
            }
            if(is_sat)
            {
                vst1q_u8(output_ptr + x, vcombine_u8(vqmovn_u16(tmp1_low), vqmovn_u16(tmp1_high)));
            }
            else
            {
                vst1q_u8(output_ptr + x, vcombine_u8(vmovn_u16(tmp1_low), vmovn_u16(tmp1_high)));
            }
        }

        // Compute left-over elements
        for(; x < window_end_x; ++x)
        {
            uint16_t tmp = static_cast<uint16_t>(*(input1_ptr + x)) * static_cast<uint16_t>(*(input2_ptr + x));

            if(is_scale255)
            {
                float tmp_f = static_cast<float>(tmp) * scale255_constant;
                tmp         = static_cast<uint16_t>(tmp_f + 0.5f);
            }
            else
            {
                tmp >>= n;
            }
            if(is_sat && tmp > 255)
            {
                tmp = 255;
            }
            *(output_ptr + x) = static_cast<uint8_t>(tmp);
        }
    },
    input1, input2, dst);
}

template <bool is_scale255, bool is_sat>
inline int16x8_t mul_S16_S16_S16_n_loop(const int16x8_t &src1, const int16x8_t &src2, int n)
{
    int32x4_t       tmp1_high = vmovl_s16(vget_high_s16(src1));
    const int32x4_t tmp2_high = vmovl_s16(vget_high_s16(src2));
    int32x4_t       tmp1_low  = vmovl_s16(vget_low_s16(src1));
    const int32x4_t tmp2_low  = vmovl_s16(vget_low_s16(src2));

    tmp1_high = vmulq_s32(tmp1_high, tmp2_high);
    tmp1_low  = vmulq_s32(tmp1_low, tmp2_low);

    if(is_scale255)
    {
        tmp1_high = scale255_S32_S32(tmp1_high);
        tmp1_low  = scale255_S32_S32(tmp1_low);
    }
    else
    {
        // Right shift amount
        const int32x4_t vn = vdupq_n_s32(-n);
        // Left shift amount
        const int32x4_t vnl = vdupq_n_s32(n);
        // Calculate conversion bit
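        // An arithmetic shift right rounds negative values toward -inf; adding
        // (2^n - 1), built below as (sign << n) - sign, makes the shift round
        // toward zero, as required by RoundingPolicy::TO_ZERO.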
        const uint32x4_t tmp1_high_u  = vreinterpretq_u32_s32(tmp1_high);
        const uint32x4_t tmp1_low_u   = vreinterpretq_u32_s32(tmp1_low);
        const uint32x4_t sign_high    = vshrq_n_u32(tmp1_high_u, 31);
        const uint32x4_t sign_low     = vshrq_n_u32(tmp1_low_u, 31);
        const int32x4_t  sign_high_s  = vreinterpretq_s32_u32(sign_high);
        const int32x4_t  sign_low_s   = vreinterpretq_s32_u32(sign_low);
        const int32x4_t  convert_high = vsubq_s32(vshlq_s32(sign_high_s, vnl), sign_high_s);
        const int32x4_t  convert_low  = vsubq_s32(vshlq_s32(sign_low_s, vnl), sign_low_s);
        if(is_sat)
        {
            tmp1_high = vqshlq_s32(vaddq_s32(tmp1_high, convert_high), vn);
            tmp1_low  = vqshlq_s32(vaddq_s32(tmp1_low, convert_low), vn);
        }
        else
        {
            tmp1_high = vshlq_s32(vaddq_s32(tmp1_high, convert_high), vn);
            tmp1_low  = vshlq_s32(vaddq_s32(tmp1_low, convert_low), vn);
        }
    }

    if(is_sat)
    {
        return vcombine_s16(vqmovn_s32(tmp1_low), vqmovn_s32(tmp1_high));
    }
    else
    {
        return vcombine_s16(vmovn_s32(tmp1_low), vmovn_s32(tmp1_high));
    }
}

template <bool is_scale255, bool is_sat>
inline int16x8x2_t mul_S16_S16_S16_n_k(const int16x8x2_t &src1, const int16x8x2_t &src2, int n)
{
    const int16x8x2_t result =
    {
        {
            // First 8 elements
            mul_S16_S16_S16_n_loop<is_scale255, is_sat>(src1.val[0], src2.val[0], n),
            // Second 8 elements
            mul_S16_S16_S16_n_loop<is_scale255, is_sat>(src1.val[1], src2.val[1], n)
        }
    };

    return result;
}

template <bool is_scale255, bool is_sat>
void mul_S16_S16_S16(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, int n)
{
    // Create input windows
    Window win        = window;
    Window input1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
    Window input2_win = window.broadcast_if_dimension_le_one(src2->info()->tensor_shape());

    // Clear X Dimension on execution window as we handle manually
    win.set(Window::DimX, Window::Dimension(0, 1, 1));
    input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
    input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));

    Iterator input1(src1, input1_win);
    Iterator input2(src2, input2_win);
    Iterator dst(out, win);

    const int  window_step_x  = 16;
    const auto window_start_x = static_cast<int>(window.x().start());
    const auto window_end_x   = static_cast<int>(window.x().end());

    execute_window_loop(
        win, [&](const Coordinates &)
    {
        const auto input1_ptr = reinterpret_cast<const int16_t *>(input1.ptr());
        const auto input2_ptr = reinterpret_cast<const int16_t *>(input2.ptr());
        const auto output_ptr = reinterpret_cast<int16_t *>(dst.ptr());

        // Compute window_step_x elements per iteration
        int x = window_start_x;
        for(; x <= (window_end_x - window_step_x); x += window_step_x)
        {
            const int16x8x2_t ta1 =
            {
                {
                    vld1q_s16(input1_ptr + x),
                    vld1q_s16(input1_ptr + x + 8),
                }
            };
            const int16x8x2_t ta2 =
            {
                {
                    vld1q_s16(input2_ptr + x),
                    vld1q_s16(input2_ptr + x + 8),
                }
            };
            const int16x8x2_t result = mul_S16_S16_S16_n_k<is_scale255, is_sat>(ta1, ta2, n);

            vst1q_s16(output_ptr + x, result.val[0]);
            vst1q_s16(output_ptr + x + 8, result.val[1]);
        }

        // Compute left-over elements
        for(; x < window_end_x; ++x)
        {
            int32_t tmp = static_cast<int32_t>(*(input1_ptr + x)) * static_cast<int32_t>(*(input2_ptr + x));

            if(is_scale255)
            {
                float tmp_f = static_cast<float>(tmp) * scale255_constant;

                tmp = static_cast<int32_t>(tmp_f + 0.5f);
            }
            else
            {
                if(tmp >= 0)
                {
                    tmp >>= n;
                }
                else
                {
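                    // Round toward zero: bias negative values by (2^n - 1) before shifting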
                    uint32_t mask = (1u << n) - 1;
                    tmp           = (tmp + static_cast<int32_t>(mask)) >> n;
                }
            }
            if(is_sat)
            {
                tmp = (tmp > SHRT_MAX) ? SHRT_MAX : ((tmp < SHRT_MIN) ? SHRT_MIN : tmp);
            }
            *(output_ptr + x) = static_cast<int16_t>(tmp);
        }
    },
    input1, input2, dst);
}

template <bool is_sat>
inline int32x4_t mul_S32_S32_S32_n_loop(const int32x4_t &src1, const int32x4_t &src2, int n)
{
    const int32x2_t input1_1 = vget_low_s32(src1);
    const int32x2_t input2_1 = vget_low_s32(src2);
    const int32x2_t input1_2 = vget_high_s32(src1);
    const int32x2_t input2_2 = vget_high_s32(src2);

    int64x2_t tmp_1 = vmull_s32(input1_1, input2_1);
    int64x2_t tmp_2 = vmull_s32(input1_2, input2_2);

    // Apply scaling, conversion and rounding (round to zero)
    // Right shift amount
    const int64x2_t vn = vdupq_n_s64(-n);
    // Left shift amount
    const int64x2_t vnl = vdupq_n_s64(n);
    // Calculate conversion bit
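    // Same round-toward-zero correction as in mul_S16_S16_S16_n_loop, applied on 64-bit lanes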
    const uint64x2_t tmp_1_u   = vreinterpretq_u64_s64(tmp_1);
    const uint64x2_t sign_1    = vshrq_n_u64(tmp_1_u, 63);
    const int64x2_t  sign_1_s  = vreinterpretq_s64_u64(sign_1);
    const int64x2_t  convert_1 = vsubq_s64(vshlq_s64(sign_1_s, vnl), sign_1_s);

    const uint64x2_t tmp_2_u   = vreinterpretq_u64_s64(tmp_2);
    const uint64x2_t sign_2    = vshrq_n_u64(tmp_2_u, 63);
    const int64x2_t  sign_2_s  = vreinterpretq_s64_u64(sign_2);
    const int64x2_t  convert_2 = vsubq_s64(vshlq_s64(sign_2_s, vnl), sign_2_s);
    if(is_sat)
    {
        tmp_1 = vqshlq_s64(vaddq_s64(tmp_1, convert_1), vn);
        tmp_2 = vqshlq_s64(vaddq_s64(tmp_2, convert_2), vn);
        return vcombine_s32(vqmovn_s64(tmp_1), vqmovn_s64(tmp_2));
    }
    else
    {
        tmp_1 = vshlq_s64(vaddq_s64(tmp_1, convert_1), vn);
        tmp_2 = vshlq_s64(vaddq_s64(tmp_2, convert_2), vn);
        return vcombine_s32(vmovn_s64(tmp_1), vmovn_s64(tmp_2));
    }
}

template <bool is_sat>
inline int32x4x2_t mul_S32_S32_S32_n_k(const int32x4x2_t &src1, const int32x4x2_t &src2, int n)
{
    const int32x4x2_t result =
    {
        {
            // First 4 elements
            mul_S32_S32_S32_n_loop<is_sat>(src1.val[0], src2.val[0], n),
            // Second 4 elements
            mul_S32_S32_S32_n_loop<is_sat>(src1.val[1], src2.val[1], n)
        }
    };

    return result;
}

template <bool is_sat>
void mul_S32_S32_S32(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, int n)
{
    // Create input windows
    Window input1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
    Window input2_win = window.broadcast_if_dimension_le_one(src2->info()->tensor_shape());

    // Clear X Dimension on execution window as we handle manually
    Window win = window;
    win.set(Window::DimX, Window::Dimension(0, 1, 1));

    const int  window_step_x         = 8;
    const auto window_start_x        = static_cast<int>(window.x().start());
    const auto window_end_x          = static_cast<int>(window.x().end());
    const bool is_broadcast_across_x = src1->info()->tensor_shape().x() != src2->info()->tensor_shape().x();

    if(is_broadcast_across_x)
    {
        const bool     is_broadcast_input_2 = input2_win.x().step() == 0;
        Window         broadcast_win        = is_broadcast_input_2 ? input2_win : input1_win;
        Window         non_broadcast_win    = !is_broadcast_input_2 ? input2_win : input1_win;
        const ITensor *broadcast_tensor     = is_broadcast_input_2 ? src2 : src1;
        const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? src2 : src1;

        // Clear X Dimension on execution window as we handle manually
        non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));

        Iterator broadcast_input(broadcast_tensor, broadcast_win);
        Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
        Iterator dst(out, win);

        execute_window_loop(
            win, [&](const Coordinates &)
        {
            const auto non_broadcast_input_ptr = reinterpret_cast<const int32_t *>(non_broadcast_input.ptr());
            const auto output_ptr              = reinterpret_cast<int32_t *>(dst.ptr());

            const int32_t broadcast_value     = *reinterpret_cast<const int32_t *>(broadcast_input.ptr());
            const auto    broadcast_value_vec = vdupq_n_s32(broadcast_value);

            // Compute window_step_x elements per iteration
            int x = window_start_x;
            for(; x <= (window_end_x - window_step_x); x += window_step_x)
            {
                const int32x4x2_t broadcast_v =
                {
                    {
                        broadcast_value_vec,
                        broadcast_value_vec,
                    }
                };
                const int32x4x2_t non_broadcast_v =
                {
                    {
                        vld1q_s32(non_broadcast_input_ptr + x),
                        vld1q_s32(non_broadcast_input_ptr + x + 4),
                    }
                };
                const int32x4x2_t result = mul_S32_S32_S32_n_k<is_sat>(broadcast_v, non_broadcast_v, n);

                vst1q_s32(output_ptr + x, result.val[0]);
                vst1q_s32(output_ptr + x + 4, result.val[1]);
            }

            // Compute left-over elements
            for(; x < window_end_x; ++x)
            {
                int64_t tmp = static_cast<int64_t>(broadcast_value) * static_cast<int64_t>(*(non_broadcast_input_ptr + x));

                if(tmp >= 0)
                {
                    tmp >>= n;
                }
                else
                {
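                    // Round toward zero: bias negative values by (2^n - 1) before shifting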
                    uint64_t mask = ((uint64_t)1u << n) - 1;
                    tmp           = (tmp + static_cast<int64_t>(mask)) >> n;
                }
                if(is_sat)
                {
                    tmp = utility::clamp<int64_t, int32_t>(tmp);
                }
                *(output_ptr + x) = static_cast<int32_t>(tmp);
            }
        },
        broadcast_input, non_broadcast_input, dst);
    }
    else
    {
        // Clear X Dimension on execution window as we handle manually
        input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
        input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));

        Iterator input1(src1, input1_win);
        Iterator input2(src2, input2_win);
        Iterator dst(out, win);

        execute_window_loop(
            win, [&](const Coordinates &)
        {
SiCong Lid6d1b362020-09-24 17:34:23 +01001145 const auto input1_ptr = reinterpret_cast<const int32_t *>(input1.ptr());
1146 const auto input2_ptr = reinterpret_cast<const int32_t *>(input2.ptr());
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001147 const auto output_ptr = reinterpret_cast<int32_t *>(dst.ptr());
SiCong Libb88f892020-08-28 11:18:47 +01001148
SiCong Lid6d1b362020-09-24 17:34:23 +01001149 // Compute window_step_x elements per iteration
1150 int x = window_start_x;
1151 for(; x <= (window_end_x - window_step_x); x += window_step_x)
SiCong Libb88f892020-08-28 11:18:47 +01001152 {
SiCong Lid6d1b362020-09-24 17:34:23 +01001153 const int32x4x2_t ta1 =
1154 {
1155 {
1156 vld1q_s32(input1_ptr + x),
1157 vld1q_s32(input1_ptr + x + 4),
1158 }
1159 };
1160 const int32x4x2_t ta2 =
1161 {
1162 {
1163 vld1q_s32(input2_ptr + x),
1164 vld1q_s32(input2_ptr + x + 4),
1165 }
1166 };
1167 const int32x4x2_t result = mul_S32_S32_S32_n_k<is_sat>(ta1, ta2, n);
1168
1169 vst1q_s32(output_ptr + x, result.val[0]);
1170 vst1q_s32(output_ptr + x + 4, result.val[1]);
SiCong Libb88f892020-08-28 11:18:47 +01001171 }
SiCong Lid6d1b362020-09-24 17:34:23 +01001172
1173 // Compute left-over elements
1174 for(; x < window_end_x; ++x)
SiCong Libb88f892020-08-28 11:18:47 +01001175 {
SiCong Lid6d1b362020-09-24 17:34:23 +01001176 int64_t tmp = static_cast<int64_t>(*(input1_ptr + x)) * static_cast<int64_t>(*(input2_ptr + x));
1177
1178 if(tmp >= 0)
1179 {
1180 tmp >>= n;
1181 }
1182 else
1183 {
Michalis Spyrou7f8caf72021-05-13 13:35:30 +01001184 uint64_t mask = ((uint64_t)1u << n) - 1;
SiCong Lid6d1b362020-09-24 17:34:23 +01001185 tmp = (tmp + static_cast<int64_t>(mask)) >> n;
1186 }
1187 if(is_sat)
1188 {
1189 tmp = utility::clamp<int64_t, int32_t>(tmp);
1190 }
1191 *(output_ptr + x) = static_cast<int32_t>(tmp);
SiCong Libb88f892020-08-28 11:18:47 +01001192 }
SiCong Lid6d1b362020-09-24 17:34:23 +01001193 },
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001194 input1, input2, dst);
SiCong Lid6d1b362020-09-24 17:34:23 +01001195 }
SiCong Libb88f892020-08-28 11:18:47 +01001196}
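
// Illustrative sketch (not part of the kernel): the negative branch of the leftover
// loops above biases the product by 2^n - 1 so the arithmetic shift rounds towards
// zero instead of towards negative infinity. A minimal scalar reference, assuming
// 0 <= n < 63:
//
//   int64_t rounding_right_shift(int64_t product, int n)
//   {
//       if(product < 0)
//       {
//           product += (int64_t{ 1 } << n) - 1; // bias so the truncating shift rounds towards zero
//       }
//       return product >> n;
//   }
//
// For example, with n = 2: rounding_right_shift(-7, 2) == -1, whereas a plain
// arithmetic shift gives -7 >> 2 == -2.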

void mul_F32_F32_F32(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, float scale)
{
    // Create input windows
    Window input1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
    Window input2_win = window.broadcast_if_dimension_le_one(src2->info()->tensor_shape());

    // Clear the X dimension on the execution window as we handle it manually
    Window win = window;
    win.set(Window::DimX, Window::Dimension(0, 1, 1));

    constexpr int window_step_x         = 16 / sizeof(float);
    const auto    window_start_x        = static_cast<int>(window.x().start());
    const auto    window_end_x          = static_cast<int>(window.x().end());
    const bool    is_broadcast_across_x = src1->info()->tensor_shape().x() != src2->info()->tensor_shape().x();

    using ExactTagType = typename wrapper::traits::neon_vector<float, window_step_x>::tag_type;

    if(is_broadcast_across_x)
    {
        const bool     is_broadcast_input_2 = input2_win.x().step() == 0;
        Window         broadcast_win        = is_broadcast_input_2 ? input2_win : input1_win;
        Window         non_broadcast_win    = !is_broadcast_input_2 ? input2_win : input1_win;
        const ITensor *broadcast_tensor     = is_broadcast_input_2 ? src2 : src1;
        const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? src2 : src1;

        // Clear the X dimension on the execution window as we handle it manually
        non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));

        Iterator broadcast_input(broadcast_tensor, broadcast_win);
        Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
        Iterator dst(out, win);

        execute_window_loop(
            win, [&](const Coordinates &)
        {
            const auto non_broadcast_input_ptr = reinterpret_cast<const float *>(non_broadcast_input.ptr());
            const auto output_ptr              = reinterpret_cast<float *>(dst.ptr());

            const float broadcast_value     = *reinterpret_cast<const float *>(broadcast_input.ptr());
            const auto  broadcast_value_vec = wrapper::vdup_n(broadcast_value, ExactTagType{});
            const auto  scale_vec           = wrapper::vdup_n(scale, ExactTagType{});

            // Compute window_step_x elements per iteration
            int x = window_start_x;
            for(; x <= (window_end_x - window_step_x); x += window_step_x)
            {
                const auto non_broadcast_v = wrapper::vloadq(non_broadcast_input_ptr + x);
                auto       res             = wrapper::vmul(wrapper::vmul(broadcast_value_vec, non_broadcast_v), scale_vec);
                wrapper::vstore(output_ptr + x, res);
            }

            // Compute left-over elements
            for(; x < window_end_x; ++x)
            {
                const auto non_broadcast_v = *(non_broadcast_input_ptr + x);
                *(output_ptr + x)          = broadcast_value * non_broadcast_v * scale;
            }
        },
        broadcast_input, non_broadcast_input, dst);
    }
    else
    {
        // Clear the X dimension on the execution window as we handle it manually
        input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
        input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));

        Iterator input1(src1, input1_win);
        Iterator input2(src2, input2_win);
        Iterator dst(out, win);

        execute_window_loop(
            win, [&](const Coordinates &)
        {
            const auto input1_ptr = reinterpret_cast<const float *>(input1.ptr());
            const auto input2_ptr = reinterpret_cast<const float *>(input2.ptr());
            const auto output_ptr = reinterpret_cast<float *>(dst.ptr());

            // The scale vector is loop-invariant, so broadcast it once per window iteration
            const auto scale_vec = wrapper::vdup_n(scale, ExactTagType{});

            // Compute window_step_x elements per iteration
            int x = window_start_x;
            for(; x <= (window_end_x - window_step_x); x += window_step_x)
            {
                const auto ta1 = wrapper::vloadq(input1_ptr + x);
                const auto ta2 = wrapper::vloadq(input2_ptr + x);
                const auto res = wrapper::vmul(wrapper::vmul(ta1, ta2), scale_vec);
                wrapper::vstore(output_ptr + x, res);
            }

            // Compute left-over elements
            for(; x < window_end_x; ++x)
            {
                const auto ta1    = *(input1_ptr + x);
                const auto ta2    = *(input2_ptr + x);
                *(output_ptr + x) = ta1 * ta2 * scale;
            }
        },
        input1, input2, dst);
    }
}
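
// Illustrative sketch (hypothetical helper, not part of the kernel): stripped of the
// window machinery, the F32 path above reduces to a vectorised multiply-and-scale with
// a scalar tail for row widths that are not a multiple of four:
//
//   void mul_scale_f32(const float *a, const float *b, float *out, int len, float scale)
//   {
//       const float32x4_t scale_v = vdupq_n_f32(scale);
//       int x = 0;
//       for(; x <= len - 4; x += 4) // 16 / sizeof(float) == 4 lanes per iteration
//       {
//           vst1q_f32(out + x, vmulq_f32(vmulq_f32(vld1q_f32(a + x), vld1q_f32(b + x)), scale_v));
//       }
//       for(; x < len; ++x) // leftover elements
//       {
//           out[x] = a[x] * b[x] * scale;
//       }
//   }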

void c_mul_F32_F32_F32_n(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window)
{
    // Create input windows
    Window input1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
    Window input2_win = window.broadcast_if_dimension_le_one(src2->info()->tensor_shape());

    // Clear the X dimension on the execution window as we handle it manually
    Window win = window;
    win.set(Window::DimX, Window::Dimension(0, 1, 1));

    // Each iteration handles two complex values stored as interleaved floats [re0, im0, re1, im1]
    constexpr int window_step_x         = 8 / sizeof(float);
    const auto    window_start_x        = static_cast<int>(window.x().start());
    const auto    window_end_x          = static_cast<int>(window.x().end());
    const bool    is_broadcast_across_x = src1->info()->tensor_shape().x() != src2->info()->tensor_shape().x();

    using ExactTagType = typename wrapper::traits::neon_vector<float, 2>::tag_type;

    if(is_broadcast_across_x)
    {
        const bool     is_broadcast_input_2 = input2_win.x().step() == 0;
        Window         broadcast_win        = is_broadcast_input_2 ? input2_win : input1_win;
        Window         non_broadcast_win    = !is_broadcast_input_2 ? input2_win : input1_win;
        const ITensor *broadcast_tensor     = is_broadcast_input_2 ? src2 : src1;
        const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? src2 : src1;

        // Clear the X dimension on the execution window as we handle it manually
        non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));

        Iterator broadcast_input(broadcast_tensor, broadcast_win);
        Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
        Iterator dst(out, win);

        execute_window_loop(
            win, [&](const Coordinates &)
        {
            const auto non_broadcast_input_ptr = reinterpret_cast<const float *>(non_broadcast_input.ptr());
            const auto output_ptr              = reinterpret_cast<float *>(dst.ptr());

            const float broadcast_value = *reinterpret_cast<const float *>(broadcast_input.ptr());

            // Compute window_step_x elements per iteration
            int x = window_start_x;
            for(; x <= (window_end_x - window_step_x); x += window_step_x)
            {
                const auto  a = wrapper::vloadq(non_broadcast_input_ptr + 2 * x);
                float32x4_t b = vdupq_n_f32(broadcast_value);

                const float32x4_t mask  = { -1.0f, 1.0f, -1.0f, 1.0f };
                const float32x2_t tmp00 = wrapper::vdup_n(wrapper::vgetlane(a, 0), ExactTagType{});
                const float32x2_t tmp01 = wrapper::vdup_n(wrapper::vgetlane(a, 1), ExactTagType{});
                const float32x2_t tmp10 = wrapper::vdup_n(wrapper::vgetlane(a, 2), ExactTagType{});
                const float32x2_t tmp11 = wrapper::vdup_n(wrapper::vgetlane(a, 3), ExactTagType{});

                const float32x4_t tmp0 = wrapper::vcombine(tmp00, tmp10);
                const float32x4_t tmp1 = wrapper::vcombine(tmp01, tmp11);

                // All lanes of b are equal, so no vrev64 is needed before applying the sign mask
                float32x4_t res = wrapper::vmul(tmp0, b);
                b               = wrapper::vmul(b, mask);

                res = wrapper::vmla(res, tmp1, b);
                wrapper::vstore(output_ptr + 2 * x, res);
            }

            // Compute left-over elements
            for(; x < window_end_x; ++x)
            {
                const auto non_broadcast_value0 = *(non_broadcast_input_ptr + 2 * x);
                const auto non_broadcast_value1 = *(non_broadcast_input_ptr + 2 * x + 1);
                auto       res1                 = broadcast_value * (non_broadcast_value0 - non_broadcast_value1);
                auto       res2                 = broadcast_value * (non_broadcast_value1 + non_broadcast_value0);
                *(output_ptr + 2 * x)           = res1;
                *(output_ptr + 2 * x + 1)       = res2;
            }
        },
        broadcast_input, non_broadcast_input, dst);
    }
    else
    {
        // Clear the X dimension on the execution window as we handle it manually
        input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
        input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));

        Iterator input1(src1, input1_win);
        Iterator input2(src2, input2_win);
        Iterator dst(out, win);

        execute_window_loop(
            win, [&](const Coordinates &)
        {
            const auto input1_ptr = reinterpret_cast<const float *>(input1.ptr());
            const auto input2_ptr = reinterpret_cast<const float *>(input2.ptr());
            const auto output_ptr = reinterpret_cast<float *>(dst.ptr());

            // Compute window_step_x elements per iteration
            int x = window_start_x;
            for(; x <= (window_end_x - window_step_x); x += window_step_x)
            {
                const float32x4_t a = wrapper::vloadq(input1_ptr + 2 * x);
                float32x4_t       b = wrapper::vloadq(input2_ptr + 2 * x);

                const float32x4_t mask  = { -1.0f, 1.0f, -1.0f, 1.0f };
                const float32x2_t tmp00 = wrapper::vdup_n(wrapper::vgetlane(a, 0), ExactTagType{});
                const float32x2_t tmp01 = wrapper::vdup_n(wrapper::vgetlane(a, 1), ExactTagType{});
                const float32x2_t tmp10 = wrapper::vdup_n(wrapper::vgetlane(a, 2), ExactTagType{});
                const float32x2_t tmp11 = wrapper::vdup_n(wrapper::vgetlane(a, 3), ExactTagType{});

                const float32x4_t tmp0 = wrapper::vcombine(tmp00, tmp10);
                const float32x4_t tmp1 = wrapper::vcombine(tmp01, tmp11);

                float32x4_t res = wrapper::vmul(tmp0, b);

                b = wrapper::vrev64(b);
                b = wrapper::vmul(b, mask);

                res = wrapper::vmla(res, tmp1, b);
                wrapper::vstore(output_ptr + 2 * x, res);
            }

            // Compute left-over elements
            for(; x < window_end_x; ++x)
            {
                const auto a0   = *(input1_ptr + 2 * x);
                const auto a1   = *(input1_ptr + 2 * x + 1);
                const auto b0   = *(input2_ptr + 2 * x);
                const auto b1   = *(input2_ptr + 2 * x + 1);
                auto       res1 = a0 * b0 - a1 * b1;
                auto       res2 = a0 * b1 + a1 * b0;
                *(output_ptr + 2 * x)     = res1;
                *(output_ptr + 2 * x + 1) = res2;
            }
        },
        input1, input2, dst);
    }
}
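
// Illustrative sketch (not part of the kernel): each complex value is stored as two
// consecutive floats [re, im], and the lane shuffles above implement the textbook
// complex product. A minimal scalar reference for a single element:
//
//   void c_mul_ref(const float a[2], const float b[2], float out[2])
//   {
//       out[0] = a[0] * b[0] - a[1] * b[1]; // real part
//       out[1] = a[0] * b[1] + a[1] * b[0]; // imaginary part
//   }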

#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
void mul_F16_F16_F16(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, float scale)
{
    // Create input windows
    Window input1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
    Window input2_win = window.broadcast_if_dimension_le_one(src2->info()->tensor_shape());

    // Clear the X dimension on the execution window as we handle it manually
    Window win = window;
    win.set(Window::DimX, Window::Dimension(0, 1, 1));

    constexpr int window_step_x         = 16;
    const auto    window_start_x        = static_cast<int>(window.x().start());
    const auto    window_end_x          = static_cast<int>(window.x().end());
    const bool    is_broadcast_across_x = src1->info()->tensor_shape().x() != src2->info()->tensor_shape().x();

    if(is_broadcast_across_x)
    {
        const bool     is_broadcast_input_2 = input2_win.x().step() == 0;
        Window         broadcast_win        = is_broadcast_input_2 ? input2_win : input1_win;
        Window         non_broadcast_win    = !is_broadcast_input_2 ? input2_win : input1_win;
        const ITensor *broadcast_tensor     = is_broadcast_input_2 ? src2 : src1;
        const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? src2 : src1;

        // Clear the X dimension on the execution window as we handle it manually
        non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));

        Iterator broadcast_input(broadcast_tensor, broadcast_win);
        Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
        Iterator dst(out, win);

        execute_window_loop(
            win, [&](const Coordinates &)
        {
            const auto non_broadcast_input_ptr = reinterpret_cast<const float16_t *>(non_broadcast_input.ptr());
            const auto output_ptr              = reinterpret_cast<float16_t *>(dst.ptr());
            const auto broadcast_value         = *reinterpret_cast<const float16_t *>(broadcast_input.ptr());

            const float16x8x2_t broadcast_value_vec =
            {
                {
                    vdupq_n_f16(broadcast_value),
                    vdupq_n_f16(broadcast_value),
                }
            };
            const auto scale_vec = vdupq_n_f16(scale);

            // Compute window_step_x elements per iteration
            int x = window_start_x;
            for(; x <= (window_end_x - window_step_x); x += window_step_x)
            {
                const float16x8x2_t non_broadcast_v =
                {
                    {
                        vld1q_f16(non_broadcast_input_ptr + x),
                        vld1q_f16(non_broadcast_input_ptr + x + 8),
                    }
                };
                const float16x8x2_t result =
                {
                    {
                        vmulq_f16(vmulq_f16(broadcast_value_vec.val[0], non_broadcast_v.val[0]), scale_vec),
                        vmulq_f16(vmulq_f16(broadcast_value_vec.val[1], non_broadcast_v.val[1]), scale_vec),
                    }
                };
                vst1q_f16(output_ptr + x, result.val[0]);
                vst1q_f16(output_ptr + x + 8, result.val[1]);
            }

            // Compute left-over elements
            for(; x < window_end_x; ++x)
            {
                const auto non_broadcast_v = *(non_broadcast_input_ptr + x);
                *(output_ptr + x)          = broadcast_value * non_broadcast_v * scale;
            }
        },
        broadcast_input, non_broadcast_input, dst);
    }
    else
    {
        // Clear the X dimension on the execution window as we handle it manually
        input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
        input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));

        Iterator input1(src1, input1_win);
        Iterator input2(src2, input2_win);
        Iterator dst(out, win);

        execute_window_loop(
            win, [&](const Coordinates &)
        {
            const auto input1_ptr = reinterpret_cast<const float16_t *>(input1.ptr());
            const auto input2_ptr = reinterpret_cast<const float16_t *>(input2.ptr());
            const auto output_ptr = reinterpret_cast<float16_t *>(dst.ptr());

            // The scale vector is loop-invariant, so broadcast it once per window iteration
            const float16x8_t scale_vec = vdupq_n_f16(scale);

            // Compute window_step_x elements per iteration
            int x = window_start_x;
            for(; x <= (window_end_x - window_step_x); x += window_step_x)
            {
                const float16x8x2_t ta1 =
                {
                    {
                        vld1q_f16(input1_ptr + x),
                        vld1q_f16(input1_ptr + x + 8),
                    }
                };
                const float16x8x2_t ta2 =
                {
                    {
                        vld1q_f16(input2_ptr + x),
                        vld1q_f16(input2_ptr + x + 8),
                    }
                };
                const float16x8x2_t result =
                {
                    {
                        vmulq_f16(vmulq_f16(ta1.val[0], ta2.val[0]), scale_vec),
                        vmulq_f16(vmulq_f16(ta1.val[1], ta2.val[1]), scale_vec),
                    }
                };
                vst1q_f16(output_ptr + x, result.val[0]);
                vst1q_f16(output_ptr + x + 8, result.val[1]);
            }

            // Compute left-over elements
            for(; x < window_end_x; ++x)
            {
                const auto ta1    = *(input1_ptr + x);
                const auto ta2    = *(input2_ptr + x);
                *(output_ptr + x) = ta1 * ta2 * scale;
            }
        },
        input1, input2, dst);
    }
}
#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */

template <bool is_scale255, bool is_sat>
void mul_U8_U8_S16(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, int n)
{
    // Create input windows
    Window win        = window;
    Window input1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
    Window input2_win = window.broadcast_if_dimension_le_one(src2->info()->tensor_shape());

    // Clear the X dimension on the execution window as we handle it manually
    win.set(Window::DimX, Window::Dimension(0, 1, 1));
    input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
    input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));

    Iterator input1(src1, input1_win);
    Iterator input2(src2, input2_win);
    Iterator dst(out, win);

    const int  window_step_x  = 16 / sizeof(uint8_t);
    const auto window_start_x = static_cast<int>(window.x().start());
    const auto window_end_x   = static_cast<int>(window.x().end());

    execute_window_loop(
        win, [&](const Coordinates &)
    {
        const auto input1_ptr = reinterpret_cast<const uint8_t *>(input1.ptr());
        const auto input2_ptr = reinterpret_cast<const uint8_t *>(input2.ptr());
        const auto output_ptr = reinterpret_cast<int16_t *>(dst.ptr());

        // Compute window_step_x elements per iteration
        int x = window_start_x;
        for(; x <= (window_end_x - window_step_x); x += window_step_x)
        {
            const uint8x16_t bv = wrapper::vloadq(input2_ptr + x);
            const uint8x16_t av = wrapper::vloadq(input1_ptr + x);

            // Widen to 16 bits before multiplying so the products cannot overflow
            uint16x8_t tmp_low  = vmovl_u8(vget_low_u8(av));
            uint16x8_t tmp_high = vmovl_u8(vget_high_u8(av));
            tmp_low             = vmulq_u16(tmp_low, vmovl_u8(vget_low_u8(bv)));
            tmp_high            = vmulq_u16(tmp_high, vmovl_u8(vget_high_u8(bv)));

            if(is_scale255)
            {
                tmp_low  = scale255_U16_U16(tmp_low);
                tmp_high = scale255_U16_U16(tmp_high);
            }
            else
            {
                const int16x8_t vn = vdupq_n_s16(-n);

                if(is_sat)
                {
                    tmp_low  = vqshlq_u16(tmp_low, vn);
                    tmp_high = vqshlq_u16(tmp_high, vn);
                }
                else
                {
                    tmp_low  = vshlq_u16(tmp_low, vn);
                    tmp_high = vshlq_u16(tmp_high, vn);
                }
            }

            if(is_sat)
            {
                static const uint16x8_t max = vdupq_n_u16(SHRT_MAX);

                tmp_low  = vminq_u16(tmp_low, max);
                tmp_high = vminq_u16(tmp_high, max);
            }

            vst1q_s16(output_ptr + x, vreinterpretq_s16_u16(tmp_low));
            vst1q_s16(output_ptr + x + 8, vreinterpretq_s16_u16(tmp_high));
        }

        // Compute left-over elements
        for(; x < window_end_x; ++x)
        {
            int32_t tmp = static_cast<int32_t>(*(input1_ptr + x)) * static_cast<int32_t>(*(input2_ptr + x));

            if(is_scale255)
            {
                float tmp_f = static_cast<float>(tmp) * scale255_constant;
                tmp         = static_cast<int32_t>(tmp_f + 0.5f);
            }
            else
            {
                tmp >>= n;
            }

            if(is_sat)
            {
                tmp = (tmp > SHRT_MAX) ? SHRT_MAX : tmp;
            }

            *(output_ptr + x) = static_cast<int16_t>(tmp);
        }
    },
    input1, input2, dst);
}
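
// Illustrative sketch (not part of the kernel): with scale == 1/255 the scalar tail
// above rescales the widened product back into range with round-to-nearest, e.g.:
//
//   int32_t tmp   = 200 * 200;                               // 40000
//   float   tmp_f = static_cast<float>(tmp) * (1.f / 255.f); // ~156.86
//   int16_t out   = static_cast<int16_t>(tmp_f + 0.5f);      // rounds to 157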

template <bool is_scale255, bool is_sat>
void mul_S16_U8_S16(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, int n)
{
    // Create input windows
    Window win        = window;
    Window input1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
    Window input2_win = window.broadcast_if_dimension_le_one(src2->info()->tensor_shape());

    // Clear the X dimension on the execution window as we handle it manually
    win.set(Window::DimX, Window::Dimension(0, 1, 1));
    input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
    input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));

    Iterator input1(src1, input1_win);
    Iterator input2(src2, input2_win);
    Iterator dst(out, win);

    const int  window_step_x  = 16;
    const auto window_start_x = static_cast<int>(window.x().start());
    const auto window_end_x   = static_cast<int>(window.x().end());

    execute_window_loop(
        win, [&](const Coordinates &)
    {
        const auto input1_ptr = reinterpret_cast<const int16_t *>(input1.ptr());
        const auto input2_ptr = reinterpret_cast<const uint8_t *>(input2.ptr());
        const auto output_ptr = reinterpret_cast<int16_t *>(dst.ptr());

        // Compute window_step_x elements per iteration
        int x = window_start_x;
        for(; x <= (window_end_x - window_step_x); x += window_step_x)
        {
            const int16x8x2_t ta1 =
            {
                {
                    vld1q_s16(input1_ptr + x),
                    vld1q_s16(input1_ptr + x + 8),
                }
            };
            const uint8x8x2_t ta2u =
            {
                {
                    vld1_u8(input2_ptr + x),
                    vld1_u8(input2_ptr + x + 8),
                }
            };
            // Widen the U8 input to S16 so both operands share the same element type
            const int16x8x2_t ta2 =
            {
                {
                    vreinterpretq_s16_u16(vmovl_u8(ta2u.val[0])),
                    vreinterpretq_s16_u16(vmovl_u8(ta2u.val[1]))
                }
            };

            const int16x8x2_t result = mul_S16_S16_S16_n_k<is_scale255, is_sat>(ta1, ta2, n);

            vst1q_s16(output_ptr + x, result.val[0]);
            vst1q_s16(output_ptr + x + 8, result.val[1]);
        }

        // Compute left-over elements
        for(; x < window_end_x; ++x)
        {
            int32_t tmp = static_cast<int32_t>(*(input1_ptr + x)) * static_cast<int32_t>(*(input2_ptr + x));

            if(is_scale255)
            {
                float tmp_f = static_cast<float>(tmp) * scale255_constant;

                tmp = static_cast<int32_t>(tmp_f + 0.5f);
            }
            else
            {
                if(tmp >= 0)
                {
                    tmp >>= n;
                }
                else
                {
                    // Bias negative products by 2^n - 1 so the arithmetic shift rounds towards zero
                    uint32_t mask = (1u << n) - 1;
                    tmp           = (tmp + static_cast<int32_t>(mask)) >> n;
                }
            }
            if(is_sat)
            {
                tmp = (tmp > SHRT_MAX) ? SHRT_MAX : ((tmp < SHRT_MIN) ? SHRT_MIN : tmp);
            }
            *(output_ptr + x) = static_cast<int16_t>(tmp);
        }
    },
    input1, input2, dst);
}

template <bool is_scale255, bool is_sat>
void mul_U8_S16_S16(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, int n)
{
    // Multiplication is commutative, so simply swap the two input buffers
    mul_S16_U8_S16<is_scale255, is_sat>(src2, src1, out, window, n);
}
} // namespace

void CpuMulKernel::configure(ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy)
{
    ARM_COMPUTE_UNUSED(rounding_policy);
    ARM_COMPUTE_ERROR_ON_NULLPTR(src1, src2, dst);

    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src1, src2, dst, scale, overflow_policy, rounding_policy));

    const TensorShape &out_shape = TensorShape::broadcast_shape(src1->tensor_shape(), src2->tensor_shape());

    // Auto initialize dst if not initialized
    set_shape_if_empty(*dst, out_shape);

    _scale          = scale;
    _scale_exponent = 0;
    _func_quantized = nullptr;
    _func_int       = nullptr;
    _func_float     = nullptr;

    bool is_scale_255 = false;
    // Check and validate the scaling factor
    if(std::abs(scale - scale255_constant) < 0.00001f)
    {
        is_scale_255 = true;
    }
    else
    {
        int exponent = 0;

        std::frexp(scale, &exponent);

        // Store the positive exponent. We know that we compute 1/2^n
        // Additionally we subtract 1 to compensate for the 0.5 mantissa returned by frexp
        _scale_exponent = std::abs(exponent - 1);
    }

    const DataType dt_input1 = src1->data_type();
    const DataType dt_input2 = src2->data_type();
    const DataType dt_output = dst->data_type();
    const bool     is_sat    = (overflow_policy == ConvertPolicy::SATURATE);

    switch(dt_input1)
    {
        case DataType::QASYMM8:
            if(dt_input2 == DataType::QASYMM8 && dt_output == DataType::QASYMM8)
            {
                if(mul_q8_neon_fixedpoint_possible(src1, src2, dst, scale))
                {
                    _func_quantized = &mul_q8_neon_fixedpoint<uint8_t>;
                }
                else
                {
                    _func_quantized = &mul_saturate_quantized_8<uint8_t>;
                }
            }
            break;
        case DataType::QASYMM8_SIGNED:
            if(dt_input2 == DataType::QASYMM8_SIGNED)
            {
                if(mul_q8_neon_fixedpoint_possible(src1, src2, dst, scale))
                {
                    _func_quantized = &mul_q8_neon_fixedpoint<int8_t>;
                }
                else
                {
                    _func_quantized = &mul_saturate_quantized_8<int8_t>;
                }
            }
            break;
        case DataType::QSYMM16:
            if(dt_input2 == DataType::QSYMM16 && dt_output == DataType::QSYMM16)
            {
                _func_quantized = &mul_saturate_QSYMM16_QSYMM16_QSYMM16;
            }
            else if(dt_input2 == DataType::QSYMM16 && dt_output == DataType::S32)
            {
                _func_int = &mul_QSYMM16_QSYMM16_S32;
            }
            break;
        case DataType::S16:
            if(DataType::U8 == dt_input2 && DataType::S16 == dt_output)
            {
                if(is_scale_255)
                {
                    _func_int = is_sat ? &mul_S16_U8_S16<true, true> : &mul_S16_U8_S16<true, false>;
                }
                else
                {
                    _func_int = is_sat ? &mul_S16_U8_S16<false, true> : &mul_S16_U8_S16<false, false>;
                }
            }
            if(DataType::S16 == dt_input2 && DataType::S16 == dt_output)
            {
                if(is_scale_255)
                {
                    _func_int = is_sat ? &mul_S16_S16_S16<true, true> : &mul_S16_S16_S16<true, false>;
                }
                else
                {
                    _func_int = is_sat ? &mul_S16_S16_S16<false, true> : &mul_S16_S16_S16<false, false>;
                }
            }
            break;
        case DataType::S32:
            if(DataType::S32 == dt_input2 && DataType::S32 == dt_output)
            {
                _func_int = is_sat ? &mul_S32_S32_S32<true> : &mul_S32_S32_S32<false>;
            }
            break;
        case DataType::U8:
            if(DataType::U8 == dt_input2 && DataType::U8 == dt_output)
            {
                if(is_scale_255)
                {
                    _func_int = is_sat ? &mul_U8_U8_U8<true, true> : &mul_U8_U8_U8<true, false>;
                }
                else
                {
                    _func_int = is_sat ? &mul_U8_U8_U8<false, true> : &mul_U8_U8_U8<false, false>;
                }
            }
            else if(DataType::U8 == dt_input2 && DataType::S16 == dt_output)
            {
                if(is_scale_255)
                {
                    _func_int = is_sat ? &mul_U8_U8_S16<true, true> : &mul_U8_U8_S16<true, false>;
                }
                else
                {
                    _func_int = is_sat ? &mul_U8_U8_S16<false, true> : &mul_U8_U8_S16<false, false>;
                }
            }
            else if(DataType::S16 == dt_input2 && DataType::S16 == dt_output)
            {
                if(is_scale_255)
                {
                    _func_int = is_sat ? &mul_U8_S16_S16<true, true> : &mul_U8_S16_S16<true, false>;
                }
                else
                {
                    _func_int = is_sat ? &mul_U8_S16_S16<false, true> : &mul_U8_S16_S16<false, false>;
                }
            }
            break;
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
        case DataType::F16:
            _func_float = &mul_F16_F16_F16;
            break;
#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
        case DataType::F32:
            _func_float = &mul_F32_F32_F32;
            break;
        default:
            ARM_COMPUTE_ERROR("Unsupported combination of input and output data types");
    }

    // Configure kernel window
    Window win;
    std::tie(win, _split_dimension) = calculate_squashed_or_max_window(*src1, *src2);

    ICpuKernel::configure(win);
}
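
// Illustrative sketch (not part of the kernel): for scale == 1/2^n, std::frexp returns
// a mantissa of 0.5 and an exponent e with scale == 0.5 * 2^e, i.e. e == 1 - n, so
// std::abs(e - 1) recovers n. For example, with scale == 0.25f:
//
//   int exponent = 0;
//   std::frexp(0.25f, &exponent);   // mantissa 0.5, exponent == -1
//   int n = std::abs(exponent - 1); // n == 2, since 0.25 == 1 / 2^2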

size_t CpuMulKernel::get_mws(const CPUInfo &platform, size_t thread_count) const
{
    ARM_COMPUTE_UNUSED(thread_count);

#if defined(ENABLE_FP32_KERNELS)
    if(this->_func_float == &mul_F32_F32_F32)
    {
        size_t mws = ICPPKernel::default_mws;
        if(platform.get_cpu_model() == CPUModel::N1)
        {
            mws = default_mws_N1_fp32_neon;
        }
        else if(platform.get_cpu_model() == CPUModel::V1)
        {
            mws = default_mws_V1_fp32_neon;
        }
        else
        {
            if(_split_dimension == Window::DimX)
            {
                // Don't split the workload too finely if the tensor has been reinterpreted as 1D.
                // This number is loosely chosen, as threading overhead varies wildly between platforms.
                return default_mws_other_platforms_1d_tensor;
            }
            return default_mws;
        }

        // Tensor is 1D or was reinterpreted as 1D
        if(this->window().shape().num_dimensions() == 1)
        {
            return mws;
        }
        else
        {
            // Scale mws down by the number of iterations along all the dimensions (x, z, w, etc.) except the one
            // we parallelize along (the y dimension). This allows for parallelization when Y_SIZE is small
            // but the other sizes are large, which boosts performance.
            mws = static_cast<size_t>(mws / (this->window().num_iterations_total() / this->window().num_iterations(1)));
            return std::max(static_cast<size_t>(1), mws);
        }
    }
#else /* ENABLE_FP32_KERNELS */
    ARM_COMPUTE_UNUSED(platform);
#endif /* ENABLE_FP32_KERNELS */
    if(_split_dimension == Window::DimX)
    {
        // Don't split the workload too finely if the tensor has been reinterpreted as 1D.
        // This number is loosely chosen, as threading overhead varies wildly between platforms.
        return default_mws_other_platforms_1d_tensor;
    }
    return default_mws;
}
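
// Illustrative sketch (hypothetical numbers, not part of the kernel): the scaling in
// get_mws() divides the tuned minimum workload size by the iteration count of every
// dimension except Y, the one being split. For a window of 8 (X) x 100 (Y) x 4 (Z)
// iterations on an N1 core:
//
//   size_t total = 8 * 100 * 4;         // 3200 iterations in total
//   size_t y     = 100;                 // iterations along the split dimension
//   size_t mws   = 22447 / (total / y); // 22447 / 32 == 701 rows per chunk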

Status CpuMulKernel::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float scale, ConvertPolicy overflow_policy,
                              RoundingPolicy rounding_policy)
{
    ARM_COMPUTE_ERROR_ON_NULLPTR(src1, src2, dst);
    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src1, src2, dst, scale, overflow_policy, rounding_policy));

    return Status{};
}

void CpuMulKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
{
    ARM_COMPUTE_UNUSED(info);
    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window);

    auto src1 = tensors.get_const_tensor(TensorType::ACL_SRC_0);
    auto src2 = tensors.get_const_tensor(TensorType::ACL_SRC_1);
    auto dst  = tensors.get_tensor(TensorType::ACL_DST);

    if(_func_quantized != nullptr)
    {
        (*_func_quantized)(src1, src2, dst, window, _scale);
    }
    else if(_func_int != nullptr)
    {
        (*_func_int)(src1, src2, dst, window, _scale_exponent);
    }
    else
    {
        ARM_COMPUTE_ERROR_ON(_func_float == nullptr);
        (*_func_float)(src1, src2, dst, window, _scale);
    }
}

const char *CpuMulKernel::name() const
{
    return "CpuMulKernel";
}

namespace
{
Status validate_arguments_complex(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst)
{
    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src1, 2, DataType::F32);
    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src2, 2, DataType::F32);

    const TensorShape &out_shape = TensorShape::broadcast_shape(src1->tensor_shape(), src2->tensor_shape());

    ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible");

    // Validate in case of a configured dst
    if(dst->total_size() > 0)
    {
        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 2, DataType::F32);
        ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, dst->tensor_shape(), 0), "Wrong shape for dst");
    }

    return Status{};
}
} // namespace

void CpuComplexMulKernel::configure(ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst)
{
    ARM_COMPUTE_ERROR_ON_NULLPTR(src1, src2, dst);
    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_complex(src1, src2, dst));

    const TensorShape &out_shape = TensorShape::broadcast_shape(src1->tensor_shape(), src2->tensor_shape());

    // Auto initialize dst if not initialized
    const TensorInfo out_info(out_shape, src1->num_channels(), src1->data_type());
    auto_init_if_empty(*dst, out_info);

    // Configure kernel window
    Window win = calculate_max_window(out_shape);

    ICpuKernel::configure(win);
}

Status CpuComplexMulKernel::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst)
{
    ARM_COMPUTE_ERROR_ON_NULLPTR(src1, src2, dst);
    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_complex(src1, src2, dst));

    return Status{};
}

void CpuComplexMulKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
{
    ARM_COMPUTE_UNUSED(info);
    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window);

    auto src1 = tensors.get_const_tensor(TensorType::ACL_SRC_0);
    auto src2 = tensors.get_const_tensor(TensorType::ACL_SRC_1);
    auto dst  = tensors.get_tensor(TensorType::ACL_DST);

    c_mul_F32_F32_F32_n(src1, src2, dst, window);
}

const char *CpuComplexMulKernel::name() const
{
    return "CpuComplexMulKernel";
}
} // namespace kernels
} // namespace cpu
} // namespace arm_compute