/*
 * Copyright (c) 2016-2022 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#include "src/cpu/kernels/CpuMulKernel.h"

#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/TensorInfo.h"
#include "src/core/CPP/Validate.h"
#include "src/core/NEON/NEAsymm.h"
#include "src/core/NEON/NESymm.h"
#include "src/core/NEON/wrapper/wrapper.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"

#include <arm_neon.h>

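// Default minimum workload sizes (in elements) below which it is not worth splitting the fp32
// Neon multiplication across threads. The N1/V1 suffixes presumably refer to Arm Neoverse N1 and
// V1 cores, and the values appear to be empirically tuned.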
namespace
{
static constexpr size_t default_mws_N1_fp32_neon = 22447;
static constexpr size_t default_mws_V1_fp32_neon = 38982;
}
namespace arm_compute
{
namespace cpu
{
namespace kernels
{
namespace
{
const float       scale255_constant      = 1.f / 255.f;
const float32x4_t scale255_constant_f32q = vdupq_n_f32(scale255_constant);
const float32x4_t positive_round_f32q    = vdupq_n_f32(0.5f);

inline Status validate_arguments(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy)
{
    ARM_COMPUTE_UNUSED(overflow_policy);
    ARM_COMPUTE_UNUSED(rounding_policy);

    ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src1);
    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src1, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::S16, DataType::S32, DataType::QSYMM16, DataType::F16,
                                                         DataType::F32);
    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src2, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::S16, DataType::S32, DataType::QSYMM16, DataType::F16,
                                                         DataType::F32);
    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
                                                         DataType::S16, DataType::QSYMM16,
                                                         DataType::S32, DataType::F16, DataType::F32);
    if(is_data_type_quantized(src1->data_type()) || is_data_type_quantized(src2->data_type()))
    {
        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src1, src2);
        ARM_COMPUTE_RETURN_ERROR_ON_MSG(overflow_policy == ConvertPolicy::WRAP, "ConvertPolicy cannot be WRAP if datatype is quantized");
    }

    if(dst->total_size() > 0)
    {
        const TensorShape &out_shape = TensorShape::broadcast_shape(src1->tensor_shape(), src2->tensor_shape());
        ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, dst->tensor_shape(), 0), "Wrong shape for dst");
        ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible");
        // clang-format off
        ARM_COMPUTE_RETURN_ERROR_ON_MSG(
            !(src1->data_type() == src2->data_type() && src2->data_type() == dst->data_type()) &&
            !(src1->data_type() == DataType::U8 && src2->data_type() == DataType::U8 && dst->data_type() == DataType::S16) &&
            !(src1->data_type() == DataType::U8 && src2->data_type() == DataType::S16 && dst->data_type() == DataType::S16) &&
            !(src1->data_type() == DataType::S16 && src2->data_type() == DataType::U8 && dst->data_type() == DataType::S16) &&
            !(src1->data_type() == DataType::QSYMM16 && src2->data_type() == DataType::QSYMM16 && dst->data_type() == DataType::S32)
            , "Invalid data type combination");
        // clang-format on
        ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->data_type() == DataType::QSYMM16 && dst->data_type() == DataType::S32 && scale != 1.f, "Unsupported scale for QSYMM16 inputs and S32 dst");
    }

    if(std::abs(scale - scale255_constant) < 0.00001f)
    {
        ARM_COMPUTE_RETURN_ERROR_ON(rounding_policy != RoundingPolicy::TO_NEAREST_UP && rounding_policy != RoundingPolicy::TO_NEAREST_EVEN);
        ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->data_type() == DataType::S32 && src2->data_type() == DataType::S32 && dst->data_type() == DataType::S32,
                                        "Scale == 1/255 is not supported if input and dst are of data type S32");
    }
    else
    {
        ARM_COMPUTE_RETURN_ERROR_ON(rounding_policy != RoundingPolicy::TO_ZERO);

        int         exponent            = 0;
        const float normalized_mantissa = std::frexp(scale, &exponent);

        // Use int scaling if factor is equal to 1/2^n for 0 <= n <= 15
        // frexp returns 0.5 as the mantissa, which means that for scale = 1/2^n the exponent is
        // 1 - n, i.e. in the range -14 <= e <= 1 (and negative for every n >= 2, as we deal with 1/2^n)
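        // Worked example: scale = 1/8 = 0.125 decomposes as 0.5 * 2^(-2), so frexp() yields a
        // normalized mantissa of 0.5 with exponent -2 (= 1 - n for n = 3), which passes the check below.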
        ARM_COMPUTE_RETURN_ERROR_ON_MSG(!((normalized_mantissa == 0.5f) && (-14 <= exponent) && (exponent <= 1)), "Scale value not supported (Should be 1/(2^n) or 1/255)");
    }

    return Status{};
}

/* Scales a given vector by 1/255.
 *
 * @note This does not work for all cases, e.g. for a float like 0.49999999999999994 and for very large floats.
 *
 * @param in Input vector to scale.
 * @return Scaled result rounded to nearest (round half up).
 */
inline int32x4_t scale255_S32_S32(int32x4_t in)
{
    // Scale
    const float32x4_t tmp = vmulq_f32(vcvtq_f32_s32(in), scale255_constant_f32q);
    // Round to nearest (round half up)
    // Add +0.5 for all values
    // Afterwards vcvt rounds toward zero
    return vcvtq_s32_f32(vaddq_f32(tmp, positive_round_f32q));
}

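// Scales a u16 vector by 1/255 by widening each half to s32 (a u16 product of two u8 values is at
// most 255 * 255 = 65025, so it always fits), scaling each half, and narrowing back to u16.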
inline uint16x8_t scale255_U16_U16(uint16x8_t in)
{
    const int32x4_t tmp_s1 = scale255_S32_S32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(in))));
    const int32x4_t tmp_s2 = scale255_S32_S32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(in))));
    return vreinterpretq_u16_s16(vcombine_s16(vmovn_s32(tmp_s2), vmovn_s32(tmp_s1)));
}

template <typename T>
inline typename std::enable_if<std::is_same<T, int8_t>::value, int8x16_t>::type
vquantize(float32x4x4_t val, const UniformQuantizationInfo &info)
{
    return vquantize_signed(val, info);
}

template <typename T>
inline typename std::enable_if<std::is_same<T, uint8_t>::value, uint8x16_t>::type
vquantize(float32x4x4_t val, const UniformQuantizationInfo &info)
{
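    // Not a recursive call: T cannot be deduced from the argument list, so this unqualified call
    // resolves to the non-template vquantize() overload for uint8 (declared in NEAsymm.h).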
    return vquantize(val, info);
}

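// Generic QASYMM8 / QASYMM8_SIGNED multiplication: dequantize both inputs to float, multiply,
// then requantize. The element-wise scale is folded into the quantization info used for the
// final quantization step (see tmp_qua_info below).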
template <typename T>
void mul_saturate_quantized_8(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, float scale)
{
    // Create input windows
    Window win        = window;
    Window input1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
    Window input2_win = window.broadcast_if_dimension_le_one(src2->info()->tensor_shape());

    // Clear X Dimension on execution window as we handle manually
    win.set(Window::DimX, Window::Dimension(0, 1, 1));

    const int  window_step_x         = 16 / sizeof(T);
    const auto window_start_x        = static_cast<int>(window.x().start());
    const auto window_end_x          = static_cast<int>(window.x().end());
    const bool is_broadcast_across_x = src1->info()->tensor_shape().x() != src2->info()->tensor_shape().x();

    const UniformQuantizationInfo output_qua_info = out->info()->quantization_info().uniform();
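    // Fold the element-wise scale into the output quantization info: quantizing the raw float
    // product with scale' = out_scale / scale gives
    //   round((in1_f * in2_f) / scale') + offset = round((in1_f * in2_f * scale) / out_scale) + offset,
    // i.e. the multiplication scale is applied for free during quantization.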
    const UniformQuantizationInfo tmp_qua_info = { output_qua_info.scale / scale, output_qua_info.offset };

    if(is_broadcast_across_x)
    {
        const bool                    is_broadcast_input_2 = input2_win.x().step() == 0;
        Window                        broadcast_win        = is_broadcast_input_2 ? input2_win : input1_win;
        Window                        non_broadcast_win    = !is_broadcast_input_2 ? input2_win : input1_win;
        const ITensor                *broadcast_tensor     = is_broadcast_input_2 ? src2 : src1;
        const ITensor                *non_broadcast_tensor = !is_broadcast_input_2 ? src2 : src1;
        const UniformQuantizationInfo broadcast_qinfo      = broadcast_tensor->info()->quantization_info().uniform();
        const UniformQuantizationInfo non_broadcast_qinfo  = non_broadcast_tensor->info()->quantization_info().uniform();

        // Clear X Dimension on execution window as we handle manually
        non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));

        Iterator broadcast_input(broadcast_tensor, broadcast_win);
        Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
        Iterator dst(out, win);

        using ExactTagType = typename wrapper::traits::neon_vector<T, window_step_x>::tag_type;

        execute_window_loop(
            win, [&](const Coordinates &)
        {
            const auto non_broadcast_input_ptr = reinterpret_cast<const T *>(non_broadcast_input.ptr());
            const auto output_ptr              = reinterpret_cast<T *>(dst.ptr());

            const auto broadcast_value     = *reinterpret_cast<const T *>(broadcast_input.ptr());
            const auto broadcast_value_vec = wrapper::vdup_n(broadcast_value, ExactTagType{});

            // Compute window_step_x elements per iteration
            int x = window_start_x;
            for(; x <= (window_end_x - window_step_x); x += window_step_x)
            {
                const auto non_broadcast_v = wrapper::vloadq(non_broadcast_input_ptr + x);

                // Dequantize inputs
                const float32x4x4_t in1_f32x4x4 = vdequantize(non_broadcast_v, non_broadcast_qinfo);
                const float32x4x4_t in2_f32x4x4 = vdequantize(broadcast_value_vec, broadcast_qinfo);

                const float32x4x4_t out_f32x4x4 =
                {
                    vmulq_f32(in1_f32x4x4.val[0], in2_f32x4x4.val[0]),
                    vmulq_f32(in1_f32x4x4.val[1], in2_f32x4x4.val[1]),
                    vmulq_f32(in1_f32x4x4.val[2], in2_f32x4x4.val[2]),
                    vmulq_f32(in1_f32x4x4.val[3], in2_f32x4x4.val[3]),
                };

                // Quantize dst
                const auto result = vquantize<T>(out_f32x4x4, tmp_qua_info);
                wrapper::vstore(output_ptr + x, result);
            }

            // Compute left-over elements
            for(; x < window_end_x; ++x)
            {
                // Dequantize inputs
                const T     src1    = *(non_broadcast_input_ptr + x);
                const float tmp_in1 = Qasymm8QuantizationHelper<T>::dequantize(src1, non_broadcast_qinfo);
                const float tmp_in2 = Qasymm8QuantizationHelper<T>::dequantize(broadcast_value, broadcast_qinfo);
                const float tmp_f   = tmp_in1 * tmp_in2;

                // Quantize dst
                const auto tmp_qua = Qasymm8QuantizationHelper<T>::quantize(tmp_f, tmp_qua_info);
                *(output_ptr + x) = tmp_qua;
            }
        },
        broadcast_input, non_broadcast_input, dst);
    }
    else
    {
        const UniformQuantizationInfo input1_qua_info = src1->info()->quantization_info().uniform();
        const UniformQuantizationInfo input2_qua_info = src2->info()->quantization_info().uniform();

        // Clear X Dimension on execution window as we handle manually
        input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
        input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));

        Iterator input1(src1, input1_win);
        Iterator input2(src2, input2_win);
        Iterator dst(out, win);

        execute_window_loop(
            win, [&](const Coordinates &)
        {
            const auto input1_ptr = reinterpret_cast<const T *>(input1.ptr());
            const auto input2_ptr = reinterpret_cast<const T *>(input2.ptr());
            const auto output_ptr = reinterpret_cast<T *>(dst.ptr());

            // Compute window_step_x elements per iteration
            int x = window_start_x;
            for(; x <= (window_end_x - window_step_x); x += window_step_x)
            {
                const auto input1_q = wrapper::vloadq(input1_ptr + x);
                const auto input2_q = wrapper::vloadq(input2_ptr + x);

                // Dequantize inputs
                const float32x4x4_t in1_f32x4x4 = vdequantize(input1_q, input1_qua_info);
                const float32x4x4_t in2_f32x4x4 = vdequantize(input2_q, input2_qua_info);

                const float32x4x4_t out_f32x4x4 =
                {
                    vmulq_f32(in1_f32x4x4.val[0], in2_f32x4x4.val[0]),
                    vmulq_f32(in1_f32x4x4.val[1], in2_f32x4x4.val[1]),
                    vmulq_f32(in1_f32x4x4.val[2], in2_f32x4x4.val[2]),
                    vmulq_f32(in1_f32x4x4.val[3], in2_f32x4x4.val[3]),
                };

                // Quantize dst
                const auto result = vquantize<T>(out_f32x4x4, tmp_qua_info);
                wrapper::vstore(output_ptr + x, result);
            }

            // Compute left-over elements
            for(; x < window_end_x; ++x)
            {
                // Dequantize inputs
                const T     src1    = *(input1_ptr + x);
                const T     src2    = *(input2_ptr + x);
                const float tmp_in1 = Qasymm8QuantizationHelper<T>::dequantize(src1, input1_qua_info);
                const float tmp_in2 = Qasymm8QuantizationHelper<T>::dequantize(src2, input2_qua_info);
                const float tmp_f   = tmp_in1 * tmp_in2;

                // Quantize dst
                const auto tmp_qua = Qasymm8QuantizationHelper<T>::quantize(tmp_f, tmp_qua_info);
                *(output_ptr + x) = tmp_qua;
            }
        },
        input1, input2, dst);
    }
}

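// Checks whether the quantized 8-bit multiplication can use the fixed-point path below: both the
// combined multiplier and the worst-case result must be representable as signed 14.18 fixed-point
// values, i.e. lie roughly within (-8192, 8192).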
bool mul_q8_neon_fixedpoint_possible(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, float scale)
{
    const auto iq0 = src0->quantization_info().uniform();
    const auto iq1 = src1->quantization_info().uniform();
    const auto oq  = dst->quantization_info().uniform();

    const auto multiplier = ((iq0.scale * iq1.scale) / oq.scale) * scale;

    if(multiplier < -8191.f || multiplier > 8191.f)
    {
        // The multiplier cannot be stored as a 14.18 signed fixed-point number
        return false;
    }

    const auto offset_out = float(oq.offset);

    const auto max_result = multiplier * (256) * (256) + offset_out;

    if(max_result > 8191.f)
    {
        // It might not be possible to store the result as a 14.18 signed fixed-point number.
        return false;
    }

    return true;
}

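// Fixed-point implementation of the quantized 8-bit multiplication:
//   1. widen the 8-bit inputs to 16 bits and subtract the quantization offsets into 32-bit lanes,
//   2. multiply the offset-corrected values as plain integers,
//   3. multiply-accumulate the products with the 14.18 fixed-point multiplier onto the 14.18
//      output offset (vmla),
//   4. shift right by 18 in total (8 + 8 + 2, with rounding on the narrowing shifts) and narrow
//      back to 8 bits.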
template <typename ScalarType>
void mul_q8_neon_fixedpoint(const ITensor *src0, const ITensor *src1, ITensor *dst, const Window &window, float scale)
{
    const auto in0_info = src0->info();
    const auto in1_info = src1->info();

    const auto &in0_shape = in0_info->tensor_shape();
    const auto &in1_shape = in1_info->tensor_shape();

    // Create input windows.
    Window in0_win = window.broadcast_if_dimension_le_one(in0_shape);
    Window in1_win = window.broadcast_if_dimension_le_one(in1_shape);

    // Clear the x dimension on the execution window as we process the whole row each iteration.
    Window win = window;
    win.set(Window::DimX, Window::Dimension(0, 1, 1));

    constexpr int window_step_x         = 16;
    const auto    window_start_x        = window.x().start();
    const auto    window_end_x          = window.x().end();
    const auto    is_broadcast_across_x = in0_shape.x() != in1_shape.x();

    const auto iq0_info = in0_info->quantization_info().uniform();
    const auto iq1_info = in1_info->quantization_info().uniform();
    const auto oq_info  = dst->info()->quantization_info().uniform();

    const auto in0_offset = iq0_info.offset;
    const auto in1_offset = iq1_info.offset;
    const auto out_offset = oq_info.offset;
    const auto multiplier = ((iq0_info.scale * iq1_info.scale) / oq_info.scale) * scale;

    constexpr int32_t two_pwr18i = 262144;
    constexpr float   two_pwr18f = 262144.f;

    const auto in0_offset_16p0  = static_cast<int16_t>(in0_offset);
    const auto in1_offset_16p0  = static_cast<int16_t>(in1_offset);
    const auto out_offset_14p18 = static_cast<int32_t>(out_offset * two_pwr18i);
    const auto multiplier_14p18 = static_cast<int32_t>(multiplier * two_pwr18f);

    if(is_broadcast_across_x)
    {
        // Prefix: a = non-broadcast, b = broadcast.

        const auto is_broadcast_input_1 = in1_win.x().step() == 0;
        auto       a_win                = is_broadcast_input_1 ? in0_win : in1_win;
        auto       b_win                = is_broadcast_input_1 ? in1_win : in0_win;
        const auto a_tensor             = is_broadcast_input_1 ? src0 : src1;
        const auto b_tensor             = is_broadcast_input_1 ? src1 : src0;

        const auto a_offset_16p0 = is_broadcast_input_1 ? in0_offset_16p0 : in1_offset_16p0;
        const auto b_offset_16p0 = is_broadcast_input_1 ? in1_offset : in0_offset;
#ifndef __aarch64__
        const auto a_offset = is_broadcast_input_1 ? in0_offset : in1_offset;
        const auto b_offset = is_broadcast_input_1 ? in1_offset : in0_offset;
#endif //__aarch64__
        const auto a_voffset_16p0 = wrapper::vdup_n(a_offset_16p0, wrapper::traits::vector_64_tag());

        // Clear the x dimension on the execution window as we process the whole row each iteration.
        a_win.set(Window::DimX, Window::Dimension(0, 1, 1));

        Iterator a_input_it(a_tensor, a_win);
        Iterator b_input_it(b_tensor, b_win);
        Iterator out_it(dst, win);

        execute_window_loop(
            win, [&](const Coordinates &)
        {
            const auto a_ptr   = reinterpret_cast<const ScalarType *>(a_input_it.ptr());
            const auto b_ptr   = reinterpret_cast<const ScalarType *>(b_input_it.ptr());
            const auto out_ptr = reinterpret_cast<ScalarType *>(out_it.ptr());

            const auto b_val            = *b_ptr;
            const auto b_offseted_32p0  = static_cast<int32_t>(b_val - b_offset_16p0);
            const auto b_voffseted_32p0 = wrapper::vdup_n(b_offseted_32p0, wrapper::traits::vector_128_tag());

            const auto vmultiplier_14p18 = wrapper::vdup_n(multiplier_14p18, wrapper::traits::vector_128_tag());
            const auto voffsetout_14p18  = wrapper::vdup_n(out_offset_14p18, wrapper::traits::vector_128_tag());

            int x = window_start_x;

            for(; x <= (window_end_x - window_step_x); x += window_step_x)
            {
                // Load the inputs.
                const auto a_vin_8p0 = wrapper::vloadq(a_ptr + x);

                // Widen the non-broadcast elements to signed 16-bit regardless of the input signedness.
                const auto a_vin_16p0_0 = wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(a_vin_8p0)));
                const auto a_vin_16p0_1 = wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(a_vin_8p0)));

                const auto voffseted_32p0_00 = wrapper::vsubl(wrapper::vgetlow(a_vin_16p0_0), a_voffset_16p0);
                const auto voffseted_32p0_01 = wrapper::vsubl(wrapper::vgethigh(a_vin_16p0_0), a_voffset_16p0);
                const auto voffseted_32p0_10 = wrapper::vsubl(wrapper::vgetlow(a_vin_16p0_1), a_voffset_16p0);
                const auto voffseted_32p0_11 = wrapper::vsubl(wrapper::vgethigh(a_vin_16p0_1), a_voffset_16p0);

                const auto vinnermul_32p0_00 = wrapper::vmul(voffseted_32p0_00, b_voffseted_32p0);
                const auto vinnermul_32p0_01 = wrapper::vmul(voffseted_32p0_01, b_voffseted_32p0);
                const auto vinnermul_32p0_10 = wrapper::vmul(voffseted_32p0_10, b_voffseted_32p0);
                const auto vinnermul_32p0_11 = wrapper::vmul(voffseted_32p0_11, b_voffseted_32p0);

                const auto vout_14p18_00 = wrapper::vmla(voffsetout_14p18, vinnermul_32p0_00, vmultiplier_14p18);
                const auto vout_14p18_01 = wrapper::vmla(voffsetout_14p18, vinnermul_32p0_01, vmultiplier_14p18);
                const auto vout_14p18_10 = wrapper::vmla(voffsetout_14p18, vinnermul_32p0_10, vmultiplier_14p18);
                const auto vout_14p18_11 = wrapper::vmla(voffsetout_14p18, vinnermul_32p0_11, vmultiplier_14p18);

                // These right shifts revert the earlier multiplication by 2^18. A hard limit of a
                // maximum shift by 8 per instruction means the total shift must be split across several instructions.
                const auto vout_15p1_00 = wrapper::vqrshrn_ex<8, ScalarType>(wrapper::vshrq_n<8>(vout_14p18_00));
                const auto vout_15p1_01 = wrapper::vqrshrn_ex<8, ScalarType>(wrapper::vshrq_n<8>(vout_14p18_01));
                const auto vout_15p1_10 = wrapper::vqrshrn_ex<8, ScalarType>(wrapper::vshrq_n<8>(vout_14p18_10));
                const auto vout_15p1_11 = wrapper::vqrshrn_ex<8, ScalarType>(wrapper::vshrq_n<8>(vout_14p18_11));

                const auto vout_15p1_0 = wrapper::vcombine(
                                             vout_15p1_00,
                                             vout_15p1_01);

                const auto vout_15p1_1 = wrapper::vcombine(
                                             vout_15p1_10,
                                             vout_15p1_11);

                const auto vout_8p0 = wrapper::vcombine(
                                          wrapper::vqrshrn<2>(vout_15p1_0),
                                          wrapper::vqrshrn<2>(vout_15p1_1));
                wrapper::vstore(out_ptr + x, vout_8p0);
            }

            // Process the left-over elements.
            for(; x < window_end_x; ++x)
            {
#ifdef __aarch64__
                out_ptr[x] = wrapper::vqrshrn<2>(wrapper::vqrshrn_ex<8, ScalarType>(wrapper::vshrq_n<8>(
                                                     (multiplier_14p18 * (int32_t(a_ptr[x]) - a_offset_16p0) * (int32_t(b_val) - b_offset_16p0)) + out_offset_14p18)));
#else  //__aarch64__
                out_ptr[x] = utility::clamp<int32_t, ScalarType>(support::cpp11::lround(multiplier * ((float(a_ptr[x]) - a_offset) * (float(b_val) - b_offset)) + float(out_offset)));
#endif //__aarch64__
            }
        },
        a_input_it, b_input_it, out_it);
    }
    else
    {
        const auto voffset0_16p0     = wrapper::vdup_n(in0_offset_16p0, wrapper::traits::vector_64_tag());
        const auto voffset1_16p0     = wrapper::vdup_n(in1_offset_16p0, wrapper::traits::vector_64_tag());
        const auto voffsetout_14p18  = wrapper::vdup_n(out_offset_14p18, wrapper::traits::vector_128_tag());
        const auto vmultiplier_14p18 = wrapper::vdup_n(multiplier_14p18, wrapper::traits::vector_128_tag());

        // Clear the x dimension on the execution window as we process the whole row each iteration.
        in0_win.set(Window::DimX, Window::Dimension(0, 1, 1));
        in1_win.set(Window::DimX, Window::Dimension(0, 1, 1));

        Iterator in0_it(src0, in0_win);
        Iterator in1_it(src1, in1_win);
        Iterator out_it(dst, win);

        execute_window_loop(
            win, [&](const Coordinates &)
        {
            const auto in0_ptr = reinterpret_cast<const ScalarType *>(in0_it.ptr());
            const auto in1_ptr = reinterpret_cast<const ScalarType *>(in1_it.ptr());
            const auto out_ptr = reinterpret_cast<ScalarType *>(out_it.ptr());

            int x = window_start_x;

            for(; x <= (window_end_x - window_step_x); x += window_step_x)
            {
                // Load the inputs.
                const auto vin0_8p0 = wrapper::vloadq(in0_ptr + x);
                const auto vin1_8p0 = wrapper::vloadq(in1_ptr + x);

                // Widen the input elements to signed 16-bit regardless of the input signedness.
                const auto vin0_16p0_0 = wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(vin0_8p0)));
                const auto vin0_16p0_1 = wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(vin0_8p0)));
                const auto vin1_16p0_0 = wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(vin1_8p0)));
                const auto vin1_16p0_1 = wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(vin1_8p0)));

                const auto voffseted0_32p0_00 = wrapper::vsubl(wrapper::vgetlow(vin0_16p0_0), voffset0_16p0);
                const auto voffseted0_32p0_01 = wrapper::vsubl(wrapper::vgethigh(vin0_16p0_0), voffset0_16p0);
                const auto voffseted0_32p0_10 = wrapper::vsubl(wrapper::vgetlow(vin0_16p0_1), voffset0_16p0);
                const auto voffseted0_32p0_11 = wrapper::vsubl(wrapper::vgethigh(vin0_16p0_1), voffset0_16p0);

                const auto voffseted1_32p0_00 = wrapper::vsubl(wrapper::vgetlow(vin1_16p0_0), voffset1_16p0);
                const auto voffseted1_32p0_01 = wrapper::vsubl(wrapper::vgethigh(vin1_16p0_0), voffset1_16p0);
                const auto voffseted1_32p0_10 = wrapper::vsubl(wrapper::vgetlow(vin1_16p0_1), voffset1_16p0);
                const auto voffseted1_32p0_11 = wrapper::vsubl(wrapper::vgethigh(vin1_16p0_1), voffset1_16p0);

                const auto vinnermul_32p0_00 = wrapper::vmul(voffseted0_32p0_00, voffseted1_32p0_00);
                const auto vinnermul_32p0_01 = wrapper::vmul(voffseted0_32p0_01, voffseted1_32p0_01);
                const auto vinnermul_32p0_10 = wrapper::vmul(voffseted0_32p0_10, voffseted1_32p0_10);
                const auto vinnermul_32p0_11 = wrapper::vmul(voffseted0_32p0_11, voffseted1_32p0_11);

                const auto vout_14p18_00 = wrapper::vmla(voffsetout_14p18, vinnermul_32p0_00, vmultiplier_14p18);
                const auto vout_14p18_01 = wrapper::vmla(voffsetout_14p18, vinnermul_32p0_01, vmultiplier_14p18);
                const auto vout_14p18_10 = wrapper::vmla(voffsetout_14p18, vinnermul_32p0_10, vmultiplier_14p18);
                const auto vout_14p18_11 = wrapper::vmla(voffsetout_14p18, vinnermul_32p0_11, vmultiplier_14p18);

                // These right shifts revert the earlier multiplication by 2^18. A hard limit of a
                // maximum shift by 8 per instruction means the total shift must be split across several instructions.
                const auto vout_14p2_00 = wrapper::vqrshrn_ex<8, ScalarType>(wrapper::vshrq_n<8>(vout_14p18_00));
                const auto vout_14p2_01 = wrapper::vqrshrn_ex<8, ScalarType>(wrapper::vshrq_n<8>(vout_14p18_01));
                const auto vout_14p2_10 = wrapper::vqrshrn_ex<8, ScalarType>(wrapper::vshrq_n<8>(vout_14p18_10));
                const auto vout_14p2_11 = wrapper::vqrshrn_ex<8, ScalarType>(wrapper::vshrq_n<8>(vout_14p18_11));

                const auto vout_14p2_0 = wrapper::vcombine(
                                             vout_14p2_00,
                                             vout_14p2_01);

                const auto vout_14p2_1 = wrapper::vcombine(
                                             vout_14p2_10,
                                             vout_14p2_11);

                const auto vout_8p0 = wrapper::vcombine(
                                          wrapper::vqrshrn<2>(vout_14p2_0),
                                          wrapper::vqrshrn<2>(vout_14p2_1));
                wrapper::vstore(out_ptr + x, vout_8p0);
            }

            // Process the left-over elements.
            for(; x < window_end_x; ++x)
            {
#ifdef __aarch64__
                out_ptr[x] = wrapper::vqrshrn<2>(wrapper::vqrshrn_ex<8, ScalarType>(wrapper::vshrq_n<8>(
                                                     (multiplier_14p18 * (int32_t(in0_ptr[x]) - in0_offset_16p0) * (int32_t(in1_ptr[x]) - in1_offset_16p0)) + out_offset_14p18)));
#else  //__aarch64__
                out_ptr[x] = utility::clamp<int32_t, ScalarType>(support::cpp11::lround(multiplier * ((float(in0_ptr[x]) - in0_offset) * (float(in1_ptr[x]) - in1_offset)) + float(out_offset)));
#endif //__aarch64__
            }
        },
        in0_it, in1_it, out_it);
    }
}

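// QSYMM16 * QSYMM16 -> QSYMM16: dequantize to float, multiply, and requantize with the
// element-wise scale folded into the output quantization info, mirroring the 8-bit path above.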
void mul_saturate_QSYMM16_QSYMM16_QSYMM16(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, float scale)
{
    const UniformQuantizationInfo input1_qua_info = src1->info()->quantization_info().uniform();
    const UniformQuantizationInfo input2_qua_info = src2->info()->quantization_info().uniform();
    const UniformQuantizationInfo output_qua_info = out->info()->quantization_info().uniform();

    // Create input windows
    Window win        = window;
    Window input1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
    Window input2_win = window.broadcast_if_dimension_le_one(src2->info()->tensor_shape());

    // Clear X Dimension on execution window as we handle manually
    win.set(Window::DimX, Window::Dimension(0, 1, 1));
    input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
    input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));

    Iterator input1(src1, input1_win);
    Iterator input2(src2, input2_win);
    Iterator dst(out, win);

    const int  window_step_x  = 16;
    const auto window_start_x = static_cast<int>(window.x().start());
    const auto window_end_x   = static_cast<int>(window.x().end());

    const UniformQuantizationInfo tmp_qua_info = { output_qua_info.scale / scale, output_qua_info.offset };

    execute_window_loop(
        win, [&](const Coordinates &)
    {
        const auto input1_ptr = reinterpret_cast<const qsymm16_t *>(input1.ptr());
        const auto input2_ptr = reinterpret_cast<const qsymm16_t *>(input2.ptr());
        const auto output_ptr = reinterpret_cast<qsymm16_t *>(dst.ptr());

        // Compute window_step_x elements per iteration
        int x = window_start_x;
        for(; x <= (window_end_x - window_step_x); x += window_step_x)
        {
            const qsymm16x8x2_t input1_q =
            {
                {
                    vld1q_s16(input1_ptr + x),
                    vld1q_s16(input1_ptr + x + 8),
                }
            };
            const qsymm16x8x2_t input2_q =
            {
                {
                    vld1q_s16(input2_ptr + x),
                    vld1q_s16(input2_ptr + x + 8),
                }
            };

            // Dequantize inputs
            const float32x4x4_t in1_f32x4x4 = vdequantize(input1_q, input1_qua_info);
            const float32x4x4_t in2_f32x4x4 = vdequantize(input2_q, input2_qua_info);

            const float32x4x4_t out_f32x4x4 =
            {
                vmulq_f32(in1_f32x4x4.val[0], in2_f32x4x4.val[0]),
                vmulq_f32(in1_f32x4x4.val[1], in2_f32x4x4.val[1]),
                vmulq_f32(in1_f32x4x4.val[2], in2_f32x4x4.val[2]),
                vmulq_f32(in1_f32x4x4.val[3], in2_f32x4x4.val[3]),
            };

            const qsymm16x8x2_t result = vquantize_qsymm16(out_f32x4x4, tmp_qua_info);
            vst1q_s16(output_ptr + x, result.val[0]);
            vst1q_s16(output_ptr + x + 8, result.val[1]);
        }

        // Compute left-over elements
        for(; x < window_end_x; ++x)
        {
            // Dequantize inputs
            float tmp_in1 = static_cast<float>(*(input1_ptr + x)) * input1_qua_info.scale;
            float tmp_in2 = static_cast<float>(*(input2_ptr + x)) * input2_qua_info.scale;
            float tmp_f   = tmp_in1 * tmp_in2;

            // Quantize dst; lrintf() matches the rounding mode of the vectorized quantization above
            int32_t   tmp     = lrintf(tmp_f / tmp_qua_info.scale);
            qsymm16_t tmp_qua = static_cast<qsymm16_t>((tmp > SHRT_MAX) ? SHRT_MAX : ((tmp < SHRT_MIN) ? SHRT_MIN : tmp));
            *(output_ptr + x) = tmp_qua;
        }
    },
    input1, input2, dst);
}

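// QSYMM16 * QSYMM16 -> S32: plain widening integer multiply. The scale parameter is unused here;
// validate_arguments() only accepts scale == 1 for this data type combination.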
void mul_QSYMM16_QSYMM16_S32(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, int scale)
{
    ARM_COMPUTE_UNUSED(scale);

    // Create input windows
    Window win        = window;
    Window input1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
    Window input2_win = window.broadcast_if_dimension_le_one(src2->info()->tensor_shape());

    // Clear X Dimension on execution window as we handle manually
    win.set(Window::DimX, Window::Dimension(0, 1, 1));
    input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
    input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));

    Iterator input1(src1, input1_win);
    Iterator input2(src2, input2_win);
    Iterator dst(out, win);

    const int  window_step_x  = 16;
    const auto window_start_x = static_cast<int>(window.x().start());
    const auto window_end_x   = static_cast<int>(window.x().end());

    execute_window_loop(
        win, [&](const Coordinates &)
    {
        const auto input1_ptr = reinterpret_cast<const qsymm16_t *>(input1.ptr());
        const auto input2_ptr = reinterpret_cast<const qsymm16_t *>(input2.ptr());
        const auto output_ptr = reinterpret_cast<int32_t *>(dst.ptr());

        // Compute window_step_x elements per iteration
        int x = window_start_x;
        for(; x <= (window_end_x - window_step_x); x += window_step_x)
        {
            const qsymm16x8x2_t input1_q =
            {
                {
                    vld1q_s16(input1_ptr + x),
                    vld1q_s16(input1_ptr + x + 8),
                }
            };
            const qsymm16x8x2_t input2_q =
            {
                {
                    vld1q_s16(input2_ptr + x),
                    vld1q_s16(input2_ptr + x + 8),
                }
            };

            const int32x4x4_t in1_s32 =
            {
                {
                    vmovl_s16(vget_low_s16(input1_q.val[0])),
                    vmovl_s16(vget_high_s16(input1_q.val[0])),
                    vmovl_s16(vget_low_s16(input1_q.val[1])),
                    vmovl_s16(vget_high_s16(input1_q.val[1])),
                }
            };
            const int32x4x4_t in2_s32 =
            {
                {
                    vmovl_s16(vget_low_s16(input2_q.val[0])),
                    vmovl_s16(vget_high_s16(input2_q.val[0])),
                    vmovl_s16(vget_low_s16(input2_q.val[1])),
                    vmovl_s16(vget_high_s16(input2_q.val[1])),
                }
            };

            const int32x4x4_t result =
            {
                {
                    vmulq_s32(in1_s32.val[0], in2_s32.val[0]),
                    vmulq_s32(in1_s32.val[1], in2_s32.val[1]),
                    vmulq_s32(in1_s32.val[2], in2_s32.val[2]),
                    vmulq_s32(in1_s32.val[3], in2_s32.val[3]),
                }
            };

            vst1q_s32(output_ptr + x, result.val[0]);
            vst1q_s32(output_ptr + x + 4, result.val[1]);
            vst1q_s32(output_ptr + x + 8, result.val[2]);
            vst1q_s32(output_ptr + x + 12, result.val[3]);
        }

        // Compute left-over elements
        for(; x < window_end_x; ++x)
        {
            int32_t tmp = static_cast<int32_t>(*(input1_ptr + x)) * static_cast<int32_t>(*(input2_ptr + x));
            *(output_ptr + x) = tmp;
        }
    },
    input1, input2, dst);
}

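// U8 * U8 -> U8. The compile-time flags select the scaling variant: is_scale255 uses the rounding
// 1/255 helpers above, otherwise the scale is 1/2^n and is applied as a right shift by n;
// is_sat selects saturating narrowing of the 16-bit intermediates.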
template <bool is_scale255, bool is_sat>
void mul_U8_U8_U8(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, int n)
{
    // Create input windows
    Window win        = window;
    Window input1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
    Window input2_win = window.broadcast_if_dimension_le_one(src2->info()->tensor_shape());

    // Clear X Dimension on execution window as we handle manually
    win.set(Window::DimX, Window::Dimension(0, 1, 1));
    input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
    input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));

    Iterator input1(src1, input1_win);
    Iterator input2(src2, input2_win);
    Iterator dst(out, win);

    const int  window_step_x  = 16 / sizeof(uint8_t);
    const auto window_start_x = static_cast<int>(window.x().start());
    const auto window_end_x   = static_cast<int>(window.x().end());

    execute_window_loop(
        win, [&](const Coordinates &)
    {
        const auto input1_ptr = reinterpret_cast<const uint8_t *>(input1.ptr());
        const auto input2_ptr = reinterpret_cast<const uint8_t *>(input2.ptr());
        const auto output_ptr = reinterpret_cast<uint8_t *>(dst.ptr());

        // Compute window_step_x elements per iteration
        int x = window_start_x;
        for(; x <= (window_end_x - window_step_x); x += window_step_x)
        {
            const uint8x16_t ta1 = wrapper::vloadq(input1_ptr + x);
            const uint8x16_t ta2 = wrapper::vloadq(input2_ptr + x);

            uint16x8_t       tmp1_high = vmovl_u8(vget_high_u8(ta1));
            const uint16x8_t tmp2_high = vmovl_u8(vget_high_u8(ta2));
            uint16x8_t       tmp1_low  = vmovl_u8(vget_low_u8(ta1));
            const uint16x8_t tmp2_low  = vmovl_u8(vget_low_u8(ta2));

            tmp1_high = vmulq_u16(tmp1_high, tmp2_high);
            tmp1_low  = vmulq_u16(tmp1_low, tmp2_low);

            if(is_scale255)
            {
                tmp1_high = scale255_U16_U16(tmp1_high);
                tmp1_low  = scale255_U16_U16(tmp1_low);
            }
            else
            {
                const int16x8_t vn = vdupq_n_s16(-n);

                if(is_sat)
                {
                    tmp1_high = vqshlq_u16(tmp1_high, vn);
                    tmp1_low  = vqshlq_u16(tmp1_low, vn);
                }
                else
                {
                    tmp1_high = vshlq_u16(tmp1_high, vn);
                    tmp1_low  = vshlq_u16(tmp1_low, vn);
                }
            }
            if(is_sat)
            {
                vst1q_u8(output_ptr + x, vcombine_u8(vqmovn_u16(tmp1_low), vqmovn_u16(tmp1_high)));
            }
            else
            {
                vst1q_u8(output_ptr + x, vcombine_u8(vmovn_u16(tmp1_low), vmovn_u16(tmp1_high)));
            }
        }

        // Compute left-over elements
        for(; x < window_end_x; ++x)
        {
            uint16_t tmp = static_cast<uint16_t>(*(input1_ptr + x)) * static_cast<uint16_t>(*(input2_ptr + x));

            if(is_scale255)
            {
                float tmp_f = static_cast<float>(tmp) * scale255_constant;
                tmp         = static_cast<uint16_t>(tmp_f + 0.5f);
            }
            else
            {
                tmp >>= n;
            }
            if(is_sat && tmp > 255)
            {
                tmp = 255;
            }
            *(output_ptr + x) = static_cast<uint8_t>(tmp);
        }
    },
    input1, input2, dst);
}

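// Multiplies eight signed 16-bit lanes using 32-bit intermediates. For the 1/2^n scale, a plain
// arithmetic right shift would round towards negative infinity; adding (2^n - 1) to negative
// products first (the "conversion bit" computed below) makes the shift round towards zero instead.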
template <bool is_scale255, bool is_sat>
inline int16x8_t mul_S16_S16_S16_n_loop(const int16x8_t &src1, const int16x8_t &src2, int n)
{
    int32x4_t       tmp1_high = vmovl_s16(vget_high_s16(src1));
    const int32x4_t tmp2_high = vmovl_s16(vget_high_s16(src2));
    int32x4_t       tmp1_low  = vmovl_s16(vget_low_s16(src1));
    const int32x4_t tmp2_low  = vmovl_s16(vget_low_s16(src2));

    tmp1_high = vmulq_s32(tmp1_high, tmp2_high);
    tmp1_low  = vmulq_s32(tmp1_low, tmp2_low);

    if(is_scale255)
    {
        tmp1_high = scale255_S32_S32(tmp1_high);
        tmp1_low  = scale255_S32_S32(tmp1_low);
    }
    else
    {
        // Right shift amount
        const int32x4_t vn = vdupq_n_s32(-n);
        // Left shift amount
        const int32x4_t vnl = vdupq_n_s32(n);
        // Calculate conversion bit
        const uint32x4_t tmp1_high_u  = vreinterpretq_u32_s32(tmp1_high);
        const uint32x4_t tmp1_low_u   = vreinterpretq_u32_s32(tmp1_low);
        const uint32x4_t sign_high    = vshrq_n_u32(tmp1_high_u, 31);
        const uint32x4_t sign_low     = vshrq_n_u32(tmp1_low_u, 31);
        const int32x4_t  sign_high_s  = vreinterpretq_s32_u32(sign_high);
        const int32x4_t  sign_low_s   = vreinterpretq_s32_u32(sign_low);
        const int32x4_t  convert_high = vsubq_s32(vshlq_s32(sign_high_s, vnl), sign_high_s);
        const int32x4_t  convert_low  = vsubq_s32(vshlq_s32(sign_low_s, vnl), sign_low_s);
        if(is_sat)
        {
            tmp1_high = vqshlq_s32(vaddq_s32(tmp1_high, convert_high), vn);
            tmp1_low  = vqshlq_s32(vaddq_s32(tmp1_low, convert_low), vn);
        }
        else
        {
            tmp1_high = vshlq_s32(vaddq_s32(tmp1_high, convert_high), vn);
            tmp1_low  = vshlq_s32(vaddq_s32(tmp1_low, convert_low), vn);
        }
    }

    if(is_sat)
    {
        return vcombine_s16(vqmovn_s32(tmp1_low), vqmovn_s32(tmp1_high));
    }
    else
    {
        return vcombine_s16(vmovn_s32(tmp1_low), vmovn_s32(tmp1_high));
    }
}

template <bool is_scale255, bool is_sat>
inline int16x8x2_t mul_S16_S16_S16_n_k(const int16x8x2_t &src1, const int16x8x2_t &src2, int n)
{
    const int16x8x2_t result =
    {
        {
            // First 8 elements
            mul_S16_S16_S16_n_loop<is_scale255, is_sat>(src1.val[0], src2.val[0], n),
            // Second 8 elements
            mul_S16_S16_S16_n_loop<is_scale255, is_sat>(src1.val[1], src2.val[1], n)
        }
    };

    return result;
}

template <bool is_scale255, bool is_sat>
void mul_S16_S16_S16(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, int n)
{
    // Create input windows
    Window win        = window;
    Window input1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
    Window input2_win = window.broadcast_if_dimension_le_one(src2->info()->tensor_shape());

    // Clear X Dimension on execution window as we handle manually
    win.set(Window::DimX, Window::Dimension(0, 1, 1));
    input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
    input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));

    Iterator input1(src1, input1_win);
    Iterator input2(src2, input2_win);
    Iterator dst(out, win);

    const int  window_step_x  = 16;
    const auto window_start_x = static_cast<int>(window.x().start());
    const auto window_end_x   = static_cast<int>(window.x().end());

    execute_window_loop(
        win, [&](const Coordinates &)
    {
        const auto input1_ptr = reinterpret_cast<const int16_t *>(input1.ptr());
        const auto input2_ptr = reinterpret_cast<const int16_t *>(input2.ptr());
        const auto output_ptr = reinterpret_cast<int16_t *>(dst.ptr());

        // Compute window_step_x elements per iteration
        int x = window_start_x;
        for(; x <= (window_end_x - window_step_x); x += window_step_x)
        {
            const int16x8x2_t ta1 =
            {
                {
                    vld1q_s16(input1_ptr + x),
                    vld1q_s16(input1_ptr + x + 8),
                }
            };
            const int16x8x2_t ta2 =
            {
                {
                    vld1q_s16(input2_ptr + x),
                    vld1q_s16(input2_ptr + x + 8),
                }
            };
            const int16x8x2_t result = mul_S16_S16_S16_n_k<is_scale255, is_sat>(ta1, ta2, n);

            vst1q_s16(output_ptr + x, result.val[0]);
            vst1q_s16(output_ptr + x + 8, result.val[1]);
        }

        // Compute left-over elements
        for(; x < window_end_x; ++x)
        {
            int32_t tmp = static_cast<int32_t>(*(input1_ptr + x)) * static_cast<int32_t>(*(input2_ptr + x));

            if(is_scale255)
            {
                float tmp_f = static_cast<float>(tmp) * scale255_constant;

                tmp = static_cast<int32_t>(tmp_f + 0.5f);
            }
            else
            {
                if(tmp >= 0)
                {
                    tmp >>= n;
                }
                else
                {
                    uint32_t mask = (1u << n) - 1;
                    tmp           = (tmp + static_cast<int32_t>(mask)) >> n;
                }
            }
            if(is_sat)
            {
                tmp = (tmp > SHRT_MAX) ? SHRT_MAX : ((tmp < SHRT_MIN) ? SHRT_MIN : tmp);
            }
            *(output_ptr + x) = static_cast<int16_t>(tmp);
        }
    },
    input1, input2, dst);
}

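// Same round-towards-zero scheme as the S16 path above, but with 64-bit intermediates so the
// S32 * S32 products and the sign correction cannot overflow before narrowing.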
Omar Al Khatib605a9282022-11-01 17:01:24 +0000986template <bool is_sat>
Sheri Zhang1e3ab422021-03-16 17:35:08 +0000987inline int32x4_t mul_S32_S32_S32_n_loop(const int32x4_t &src1, const int32x4_t &src2, int n)
SiCong Libb88f892020-08-28 11:18:47 +0100988{
Sheri Zhang1e3ab422021-03-16 17:35:08 +0000989 const int32x2_t input1_1 = vget_low_s32(src1);
990 const int32x2_t input2_1 = vget_low_s32(src2);
991 const int32x2_t input1_2 = vget_high_s32(src1);
992 const int32x2_t input2_2 = vget_high_s32(src2);
SiCong Libb88f892020-08-28 11:18:47 +0100993
994 int64x2_t tmp_1 = vmull_s32(input1_1, input2_1);
995 int64x2_t tmp_2 = vmull_s32(input1_2, input2_2);
996
997 // Apply scaling, conversion and rounding (round to zero)
998 // Right shift amount
999 const int64x2_t vn = vdupq_n_s64(-n);
1000 // Left shift amount
1001 const int64x2_t vnl = vdupq_n_s64(n);
1002 // Calculate conversion bit
1003 const uint64x2_t tmp_1_u = vreinterpretq_u64_s64(tmp_1);
1004 const uint64x2_t sign_1 = vshrq_n_u64(tmp_1_u, 63);
1005 const int64x2_t sign_1_s = vreinterpretq_s64_u64(sign_1);
1006 const int64x2_t convert_1 = vsubq_s64(vshlq_s64(sign_1_s, vnl), sign_1_s);
1007
1008 const uint64x2_t tmp_2_u = vreinterpretq_u64_s64(tmp_2);
1009 const uint64x2_t sign_2 = vshrq_n_u64(tmp_2_u, 63);
1010 const int64x2_t sign_2_s = vreinterpretq_s64_u64(sign_2);
1011 const int64x2_t convert_2 = vsubq_s64(vshlq_s64(sign_2_s, vnl), sign_2_s);
1012 if(is_sat)
1013 {
1014 tmp_1 = vqshlq_s64(vaddq_s64(tmp_1, convert_1), vn);
1015 tmp_2 = vqshlq_s64(vaddq_s64(tmp_2, convert_2), vn);
1016 return vcombine_s32(vqmovn_s64(tmp_1), vqmovn_s64(tmp_2));
1017 }
1018 else
1019 {
1020 tmp_1 = vshlq_s64(vaddq_s64(tmp_1, convert_1), vn);
1021 tmp_2 = vshlq_s64(vaddq_s64(tmp_2, convert_2), vn);
1022 return vcombine_s32(vmovn_s64(tmp_1), vmovn_s64(tmp_2));
1023 }
1024}
1025
Omar Al Khatib605a9282022-11-01 17:01:24 +00001026template <bool is_sat>
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001027inline int32x4x2_t mul_S32_S32_S32_n_k(const int32x4x2_t &src1, const int32x4x2_t &src2, int n)
SiCong Libb88f892020-08-28 11:18:47 +01001028{
1029 const int32x4x2_t result =
1030 {
1031 {
1032 // First 4 elements
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001033 mul_S32_S32_S32_n_loop<is_sat>(src1.val[0], src2.val[0], n),
SiCong Libb88f892020-08-28 11:18:47 +01001034 // Second 4 elements
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001035 mul_S32_S32_S32_n_loop<is_sat>(src1.val[1], src2.val[1], n)
SiCong Libb88f892020-08-28 11:18:47 +01001036 }
1037 };
1038
1039 return result;
1040}
1041
1042template <bool is_sat>
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001043void mul_S32_S32_S32(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, int n)
SiCong Libb88f892020-08-28 11:18:47 +01001044{
1045 // Create input windows
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001046 Window input1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
1047 Window input2_win = window.broadcast_if_dimension_le_one(src2->info()->tensor_shape());
SiCong Libb88f892020-08-28 11:18:47 +01001048
1049 // Clear X Dimension on execution window as we handle manually
SiCong Lid6d1b362020-09-24 17:34:23 +01001050 Window win = window;
SiCong Libb88f892020-08-28 11:18:47 +01001051 win.set(Window::DimX, Window::Dimension(0, 1, 1));
SiCong Libb88f892020-08-28 11:18:47 +01001052
SiCong Lid6d1b362020-09-24 17:34:23 +01001053 const int window_step_x = 8;
1054 const auto window_start_x = static_cast<int>(window.x().start());
1055 const auto window_end_x = static_cast<int>(window.x().end());
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001056 const bool is_broadcast_across_x = src1->info()->tensor_shape().x() != src2->info()->tensor_shape().x();
SiCong Libb88f892020-08-28 11:18:47 +01001057
SiCong Lid6d1b362020-09-24 17:34:23 +01001058 if(is_broadcast_across_x)
SiCong Libb88f892020-08-28 11:18:47 +01001059 {
SiCong Lid6d1b362020-09-24 17:34:23 +01001060 const bool is_broadcast_input_2 = input2_win.x().step() == 0;
1061 Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win;
1062 Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win;
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001063 const ITensor *broadcast_tensor = is_broadcast_input_2 ? src2 : src1;
1064 const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? src2 : src1;
SiCong Libb88f892020-08-28 11:18:47 +01001065
SiCong Lid6d1b362020-09-24 17:34:23 +01001066 // Clear X Dimension on execution window as we handle manually
1067 non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));
1068
1069 Iterator broadcast_input(broadcast_tensor, broadcast_win);
1070 Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001071 Iterator dst(out, win);
SiCong Lid6d1b362020-09-24 17:34:23 +01001072
Omar Al Khatib605a9282022-11-01 17:01:24 +00001073 execute_window_loop(
1074 win, [&](const Coordinates &)
SiCong Libb88f892020-08-28 11:18:47 +01001075 {
SiCong Lid6d1b362020-09-24 17:34:23 +01001076 const auto non_broadcast_input_ptr = reinterpret_cast<const int32_t *>(non_broadcast_input.ptr());
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001077 const auto output_ptr = reinterpret_cast<int32_t *>(dst.ptr());
SiCong Libb88f892020-08-28 11:18:47 +01001078
SiCong Lid6d1b362020-09-24 17:34:23 +01001079 const int32_t broadcast_value = *reinterpret_cast<const int32_t *>(broadcast_input.ptr());
1080 const auto broadcast_value_vec = vdupq_n_s32(broadcast_value);
SiCong Libb88f892020-08-28 11:18:47 +01001081
SiCong Lid6d1b362020-09-24 17:34:23 +01001082 // Compute window_step_x elements per iteration
1083 int x = window_start_x;
1084 for(; x <= (window_end_x - window_step_x); x += window_step_x)
1085 {
1086 const int32x4x2_t broadcast_v =
1087 {
1088 {
1089 broadcast_value_vec,
1090 broadcast_value_vec,
1091 }
1092 };
1093 const int32x4x2_t non_broadcast_v =
1094 {
1095 {
1096 vld1q_s32(non_broadcast_input_ptr + x),
1097 vld1q_s32(non_broadcast_input_ptr + x + 4),
1098 }
1099 };
1100 const int32x4x2_t result = mul_S32_S32_S32_n_k<is_sat>(broadcast_v, non_broadcast_v, n);
1101
1102 vst1q_s32(output_ptr + x, result.val[0]);
1103 vst1q_s32(output_ptr + x + 4, result.val[1]);
1104 }
1105
1106 // Compute left-over elements
1107 for(; x < window_end_x; ++x)
1108 {
1109 int64_t tmp = static_cast<int64_t>(broadcast_value) * static_cast<int64_t>(*(non_broadcast_input_ptr + x));
1110
1111 if(tmp >= 0)
1112 {
1113 tmp >>= n;
1114 }
1115 else
1116 {
Michalis Spyrou7f8caf72021-05-13 13:35:30 +01001117 uint64_t mask = ((uint64_t)1u << n) - 1;
SiCong Lid6d1b362020-09-24 17:34:23 +01001118 tmp = (tmp + static_cast<int64_t>(mask)) >> n;
1119 }
1120 if(is_sat)
1121 {
1122 tmp = utility::clamp<int64_t, int32_t>(tmp);
1123 }
1124 *(output_ptr + x) = static_cast<int32_t>(tmp);
1125 }
1126 },
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001127 broadcast_input, non_broadcast_input, dst);
SiCong Lid6d1b362020-09-24 17:34:23 +01001128 }
1129 else
1130 {
1131 // Clear X Dimension on execution window as we handle manually
1132 input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
1133 input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
1134
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001135 Iterator input1(src1, input1_win);
1136 Iterator input2(src2, input2_win);
1137 Iterator dst(out, win);
SiCong Lid6d1b362020-09-24 17:34:23 +01001138
Omar Al Khatib605a9282022-11-01 17:01:24 +00001139 execute_window_loop(
1140 win, [&](const Coordinates &)
SiCong Libb88f892020-08-28 11:18:47 +01001141 {
SiCong Lid6d1b362020-09-24 17:34:23 +01001142 const auto input1_ptr = reinterpret_cast<const int32_t *>(input1.ptr());
1143 const auto input2_ptr = reinterpret_cast<const int32_t *>(input2.ptr());
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001144 const auto output_ptr = reinterpret_cast<int32_t *>(dst.ptr());
SiCong Libb88f892020-08-28 11:18:47 +01001145
SiCong Lid6d1b362020-09-24 17:34:23 +01001146 // Compute window_step_x elements per iteration
1147 int x = window_start_x;
1148 for(; x <= (window_end_x - window_step_x); x += window_step_x)
SiCong Libb88f892020-08-28 11:18:47 +01001149 {
SiCong Lid6d1b362020-09-24 17:34:23 +01001150 const int32x4x2_t ta1 =
1151 {
1152 {
1153 vld1q_s32(input1_ptr + x),
1154 vld1q_s32(input1_ptr + x + 4),
1155 }
1156 };
1157 const int32x4x2_t ta2 =
1158 {
1159 {
1160 vld1q_s32(input2_ptr + x),
1161 vld1q_s32(input2_ptr + x + 4),
1162 }
1163 };
1164 const int32x4x2_t result = mul_S32_S32_S32_n_k<is_sat>(ta1, ta2, n);
1165
1166 vst1q_s32(output_ptr + x, result.val[0]);
1167 vst1q_s32(output_ptr + x + 4, result.val[1]);
SiCong Libb88f892020-08-28 11:18:47 +01001168 }
SiCong Lid6d1b362020-09-24 17:34:23 +01001169
1170 // Compute left-over elements
1171 for(; x < window_end_x; ++x)
SiCong Libb88f892020-08-28 11:18:47 +01001172 {
SiCong Lid6d1b362020-09-24 17:34:23 +01001173 int64_t tmp = static_cast<int64_t>(*(input1_ptr + x)) * static_cast<int64_t>(*(input2_ptr + x));
1174
1175 if(tmp >= 0)
1176 {
1177 tmp >>= n;
1178 }
1179 else
1180 {
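                // Same round-towards-zero adjustment as in the broadcast path above.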
Michalis Spyrou7f8caf72021-05-13 13:35:30 +01001181 uint64_t mask = ((uint64_t)1u << n) - 1;
SiCong Lid6d1b362020-09-24 17:34:23 +01001182 tmp = (tmp + static_cast<int64_t>(mask)) >> n;
1183 }
1184 if(is_sat)
1185 {
1186 tmp = utility::clamp<int64_t, int32_t>(tmp);
1187 }
1188 *(output_ptr + x) = static_cast<int32_t>(tmp);
SiCong Libb88f892020-08-28 11:18:47 +01001189 }
SiCong Lid6d1b362020-09-24 17:34:23 +01001190 },
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001191 input1, input2, dst);
SiCong Lid6d1b362020-09-24 17:34:23 +01001192 }
SiCong Libb88f892020-08-28 11:18:47 +01001193}
1194
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001195void mul_F32_F32_F32(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, float scale)
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001196{
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +01001197 // Create input windows
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001198 Window input1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
1199 Window input2_win = window.broadcast_if_dimension_le_one(src2->info()->tensor_shape());
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001200
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +01001201 // Clear X Dimension on execution window as we handle manually
1202 Window win = window;
1203 win.set(Window::DimX, Window::Dimension(0, 1, 1));
1204
1205 constexpr int window_step_x = 16 / sizeof(float);
1206 const auto window_start_x = static_cast<int>(window.x().start());
1207 const auto window_end_x = static_cast<int>(window.x().end());
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001208 const bool is_broadcast_across_x = src1->info()->tensor_shape().x() != src2->info()->tensor_shape().x();
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +01001209
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +01001210 using ExactTagType = typename wrapper::traits::neon_vector<float, window_step_x>::tag_type;
1211
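    // If one input has x-extent 1, its single value per row is broadcast against
    // every x element of the other input; otherwise both inputs advance together.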
1212 if(is_broadcast_across_x)
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001213 {
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +01001214 const bool is_broadcast_input_2 = input2_win.x().step() == 0;
1215 Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win;
1216 Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win;
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001217 const ITensor *broadcast_tensor = is_broadcast_input_2 ? src2 : src1;
1218 const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? src2 : src1;
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +01001219
1220 // Clear X Dimension on execution window as we handle manually
1221 non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));
1222
1223 Iterator broadcast_input(broadcast_tensor, broadcast_win);
1224 Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001225 Iterator dst(out, win);
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +01001226
Omar Al Khatib605a9282022-11-01 17:01:24 +00001227 execute_window_loop(
1228 win, [&](const Coordinates &)
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001229 {
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +01001230 const auto non_broadcast_input_ptr = reinterpret_cast<const float *>(non_broadcast_input.ptr());
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001231 const auto output_ptr = reinterpret_cast<float *>(dst.ptr());
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +01001232
1233 const float broadcast_value = *reinterpret_cast<const float *>(broadcast_input.ptr());
1234 const auto broadcast_value_vec = wrapper::vdup_n(broadcast_value, ExactTagType{});
1235 const auto scale_vec = wrapper::vdup_n(scale, ExactTagType{});
1236
1237 // Compute window_step_x elements per iteration
1238 int x = window_start_x;
1239 for(; x <= (window_end_x - window_step_x); x += window_step_x)
1240 {
1241 const auto non_broadcast_v = wrapper::vloadq(non_broadcast_input_ptr + x);
1242 auto res = wrapper::vmul(wrapper::vmul(broadcast_value_vec, non_broadcast_v), scale_vec);
1243 wrapper::vstore(output_ptr + x, res);
1244 }
1245
1246 // Compute left-over elements
1247 for(; x < window_end_x; ++x)
1248 {
1249 const auto non_broadcast_v = *(non_broadcast_input_ptr + x);
1250 *(output_ptr + x) = broadcast_value * non_broadcast_v * scale;
1251 }
1252 },
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001253 broadcast_input, non_broadcast_input, dst);
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +01001254 }
1255 else
1256 {
1257 // Clear X Dimension on execution window as we handle manually
1258 input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
1259 input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
1260
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001261 Iterator input1(src1, input1_win);
1262 Iterator input2(src2, input2_win);
1263 Iterator dst(out, win);
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +01001264
Omar Al Khatib605a9282022-11-01 17:01:24 +00001265 execute_window_loop(
1266 win, [&](const Coordinates &)
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +01001267 {
1268 const auto input1_ptr = reinterpret_cast<const float *>(input1.ptr());
1269 const auto input2_ptr = reinterpret_cast<const float *>(input2.ptr());
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001270 const auto output_ptr = reinterpret_cast<float *>(dst.ptr());
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +01001271
1272 // Compute window_step_x elements per iteration
1273 int x = window_start_x;
1274 for(; x <= (window_end_x - window_step_x); x += window_step_x)
1275 {
1276 const auto ta1 = wrapper::vloadq(input1_ptr + x);
1277 const auto ta2 = wrapper::vloadq(input2_ptr + x);
1278 const auto scale_vec = wrapper::vdup_n(scale, ExactTagType{});
1279 const auto res = wrapper::vmul(wrapper::vmul(ta1, ta2), scale_vec);
1280 wrapper::vstore(output_ptr + x, res);
1281 }
1282
1283 // Compute left-over elements
1284 for(; x < window_end_x; ++x)
1285 {
1286 const auto ta1 = *(input1_ptr + x);
1287 const auto ta2 = *(input2_ptr + x);
1288 *(output_ptr + x) = ta1 * ta2 * scale;
1289 }
1290 },
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001291 input1, input2, dst);
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +01001292 }
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001293}
1294
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001295void c_mul_F32_F32_F32_n(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window)
giuros01154bc1c2019-03-26 17:44:40 +00001296{
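    // Tensors hold interleaved complex values [real, imag]. Each pair is multiplied as
    // (a0 + i*a1) * (b0 + i*b1) = (a0*b0 - a1*b1) + i*(a0*b1 + a1*b0); the vector paths
    // below implement this with lane duplication and a {-1, 1, -1, 1} sign mask.
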
Sheri Zhang4d91dc62020-09-23 11:22:50 +01001297 // Create input windows
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001298 Window input1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
1299 Window input2_win = window.broadcast_if_dimension_le_one(src2->info()->tensor_shape());
giuros01154bc1c2019-03-26 17:44:40 +00001300
Sheri Zhang4d91dc62020-09-23 11:22:50 +01001301 // Clear X Dimension on execution window as we handle manually
1302 Window win = window;
1303 win.set(Window::DimX, Window::Dimension(0, 1, 1));
giuros01154bc1c2019-03-26 17:44:40 +00001304
Sheri Zhang4d91dc62020-09-23 11:22:50 +01001305 constexpr int window_step_x = 8 / sizeof(float);
1306 const auto window_start_x = static_cast<int>(window.x().start());
1307 const auto window_end_x = static_cast<int>(window.x().end());
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001308 const bool is_broadcast_across_x = src1->info()->tensor_shape().x() != src2->info()->tensor_shape().x();
giuros01154bc1c2019-03-26 17:44:40 +00001309
Sheri Zhang2cb05d92020-11-09 15:12:32 +00001310 using ExactTagType = typename wrapper::traits::neon_vector<float, 2>::tag_type;
1311
Sheri Zhang4d91dc62020-09-23 11:22:50 +01001312 if(is_broadcast_across_x)
1313 {
1314 const bool is_broadcast_input_2 = input2_win.x().step() == 0;
1315 Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win;
1316 Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win;
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001317 const ITensor *broadcast_tensor = is_broadcast_input_2 ? src2 : src1;
1318 const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? src2 : src1;
giuros01154bc1c2019-03-26 17:44:40 +00001319
Sheri Zhang4d91dc62020-09-23 11:22:50 +01001320 // Clear X Dimension on execution window as we handle manually
1321 non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));
giuros01154bc1c2019-03-26 17:44:40 +00001322
Sheri Zhang4d91dc62020-09-23 11:22:50 +01001323 Iterator broadcast_input(broadcast_tensor, broadcast_win);
1324 Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001325 Iterator dst(out, win);
giuros01154bc1c2019-03-26 17:44:40 +00001326
Omar Al Khatib605a9282022-11-01 17:01:24 +00001327 execute_window_loop(
1328 win, [&](const Coordinates &)
Sheri Zhang4d91dc62020-09-23 11:22:50 +01001329 {
1330 const auto non_broadcast_input_ptr = reinterpret_cast<const float *>(non_broadcast_input.ptr());
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001331 const auto output_ptr = reinterpret_cast<float *>(dst.ptr());
giuros01154bc1c2019-03-26 17:44:40 +00001332
Sheri Zhang4d91dc62020-09-23 11:22:50 +01001333 const float broadcast_value = *reinterpret_cast<const float *>(broadcast_input.ptr());
1334
Sheri Zhang2cb05d92020-11-09 15:12:32 +00001335 // Compute window_step_x elements per iteration
Sheri Zhang4d91dc62020-09-23 11:22:50 +01001336 int x = window_start_x;
Sheri Zhang2cb05d92020-11-09 15:12:32 +00001337 for(; x <= (window_end_x - window_step_x); x += window_step_x)
1338 {
1339 const auto a = wrapper::vloadq(non_broadcast_input_ptr + 2 * x);
1340 float32x4_t b = vdupq_n_f32(broadcast_value);
1341
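            // b holds the same scalar in every lane, so unlike the non-broadcast
            // path no vrev64 is needed before applying the sign mask.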
1342 const float32x4_t mask = { -1.0f, 1.0f, -1.0f, 1.0f };
1343 const float32x2_t tmp00 = wrapper::vdup_n(wrapper::vgetlane(a, 0), ExactTagType{});
1344 const float32x2_t tmp01 = wrapper::vdup_n(wrapper::vgetlane(a, 1), ExactTagType{});
1345 const float32x2_t tmp10 = wrapper::vdup_n(wrapper::vgetlane(a, 2), ExactTagType{});
1346 const float32x2_t tmp11 = wrapper::vdup_n(wrapper::vgetlane(a, 3), ExactTagType{});
1347
1348 const float32x4_t tmp0 = wrapper::vcombine(tmp00, tmp10);
1349 const float32x4_t tmp1 = wrapper::vcombine(tmp01, tmp11);
1350
1351 float32x4_t res = wrapper::vmul(tmp0, b);
1352 b = wrapper::vmul(b, mask);
1353
1354 res = wrapper::vmla(res, tmp1, b);
1355 wrapper::vstore(output_ptr + 2 * x, res);
1356 }
1357
Sheri Zhang4d91dc62020-09-23 11:22:50 +01001358 // Compute left-over elements
1359 for(; x < window_end_x; ++x)
1360 {
Sheri Zhang2cb05d92020-11-09 15:12:32 +00001361 const auto non_broadcast_value0 = *(non_broadcast_input_ptr + 2 * x);
1362 const auto non_broadcast_value1 = *(non_broadcast_input_ptr + 2 * x + 1);
1363 auto res1 = broadcast_value * (non_broadcast_value0 - non_broadcast_value1);
1364 auto res2 = broadcast_value * (non_broadcast_value1 + non_broadcast_value0);
1365 *(output_ptr + 2 * x) = res1;
1366 *(output_ptr + 2 * x + 1) = res2;
Sheri Zhang4d91dc62020-09-23 11:22:50 +01001367 }
1368 },
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001369 broadcast_input, non_broadcast_input, dst);
Sheri Zhang4d91dc62020-09-23 11:22:50 +01001370 }
1371 else
1372 {
1373 // Clear X Dimension on execution window as we handle manually
1374 input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
1375 input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
1376
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001377 Iterator input1(src1, input1_win);
1378 Iterator input2(src2, input2_win);
1379 Iterator dst(out, win);
Sheri Zhang4d91dc62020-09-23 11:22:50 +01001380
Omar Al Khatib605a9282022-11-01 17:01:24 +00001381 execute_window_loop(
1382 win, [&](const Coordinates &)
Sheri Zhang4d91dc62020-09-23 11:22:50 +01001383 {
1384 const auto input1_ptr = reinterpret_cast<const float *>(input1.ptr());
1385 const auto input2_ptr = reinterpret_cast<const float *>(input2.ptr());
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001386 const auto output_ptr = reinterpret_cast<float *>(dst.ptr());
Sheri Zhang4d91dc62020-09-23 11:22:50 +01001387
Sheri Zhang4d91dc62020-09-23 11:22:50 +01001388 // Compute window_step_x elements per iteration
1389 int x = window_start_x;
1390 for(; x <= (window_end_x - window_step_x); x += window_step_x)
1391 {
1392 const float32x4_t a = wrapper::vloadq(input1_ptr + 2 * x);
1393 float32x4_t b = wrapper::vloadq(input2_ptr + 2 * x);
1394
1395 const float32x4_t mask = { -1.0f, 1.0f, -1.0f, 1.0f };
1396 const float32x2_t tmp00 = wrapper::vdup_n(wrapper::vgetlane(a, 0), ExactTagType{});
1397 const float32x2_t tmp01 = wrapper::vdup_n(wrapper::vgetlane(a, 1), ExactTagType{});
1398 const float32x2_t tmp10 = wrapper::vdup_n(wrapper::vgetlane(a, 2), ExactTagType{});
1399 const float32x2_t tmp11 = wrapper::vdup_n(wrapper::vgetlane(a, 3), ExactTagType{});
1400
1401 const float32x4_t tmp0 = wrapper::vcombine(tmp00, tmp10);
1402 const float32x4_t tmp1 = wrapper::vcombine(tmp01, tmp11);
1403
1404 float32x4_t res = wrapper::vmul(tmp0, b);
1405
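            // Swap the real/imag lanes of b and negate its imaginary lanes so the
            // multiply-accumulate below yields (a0*b0 - a1*b1, a0*b1 + a1*b0) per pair.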
1406 b = wrapper::vrev64(b);
1407 b = wrapper::vmul(b, mask);
1408
1409 res = wrapper::vmla(res, tmp1, b);
1410 wrapper::vstore(output_ptr + 2 * x, res);
1411 }
1412
1413 // Compute left-over elements
1414 for(; x < window_end_x; ++x)
1415 {
1416 const auto a0 = *(input1_ptr + 2 * x);
1417 const auto a1 = *(input1_ptr + 2 * x + 1);
1418 const auto b0 = *(input2_ptr + 2 * x);
1419 const auto b1 = *(input2_ptr + 2 * x + 1);
1420 auto res1 = a0 * b0 - a1 * b1;
1421 auto res2 = a0 * b1 + a1 * b0;
1422 *(output_ptr + 2 * x) = res1;
1423 *(output_ptr + 2 * x + 1) = res2;
1424 }
1425 },
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001426 input1, input2, dst);
Sheri Zhang4d91dc62020-09-23 11:22:50 +01001427 }
giuros01154bc1c2019-03-26 17:44:40 +00001428}
1429
Ioan-Cristian Szabo5edbd1c2017-11-13 13:34:08 +00001430#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001431void mul_F16_F16_F16(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, float scale)
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +01001432{
1433 // Create input windows
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001434 Window input1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
1435 Window input2_win = window.broadcast_if_dimension_le_one(src2->info()->tensor_shape());
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +01001436
1437 // Clear X Dimension on execution window as we handle manually
Pablo Marquez Telloc727d522021-01-27 14:16:13 +00001438 Window win = window;
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +01001439 win.set(Window::DimX, Window::Dimension(0, 1, 1));
Pablo Marquez Telloc727d522021-01-27 14:16:13 +00001440 constexpr int window_step_x = 16;
1441 const auto window_start_x = static_cast<int>(window.x().start());
1442 const auto window_end_x = static_cast<int>(window.x().end());
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001443 const bool is_broadcast_across_x = src1->info()->tensor_shape().x() != src2->info()->tensor_shape().x();
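    // Same structure as mul_F32_F32_F32, processing 16 fp16 elements per iteration
    // as two 8-lane halves.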
Pablo Marquez Telloc727d522021-01-27 14:16:13 +00001444 if(is_broadcast_across_x)
Michele Di Giorgio9428a182020-03-30 14:10:20 +01001445 {
Pablo Marquez Telloc727d522021-01-27 14:16:13 +00001446 const bool is_broadcast_input_2 = input2_win.x().step() == 0;
1447 Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win;
1448 Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win;
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001449 const ITensor *broadcast_tensor = is_broadcast_input_2 ? src2 : src1;
1450 const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? src2 : src1;
Pablo Marquez Telloc727d522021-01-27 14:16:13 +00001451 // Clear X Dimension on execution window as we handle manually
1452 non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));
1453 Iterator broadcast_input(broadcast_tensor, broadcast_win);
1454 Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001455 Iterator dst(out, win);
Omar Al Khatib605a9282022-11-01 17:01:24 +00001456 execute_window_loop(
1457 win, [&](const Coordinates &)
Michele Di Giorgio9428a182020-03-30 14:10:20 +01001458 {
Pablo Marquez Telloc727d522021-01-27 14:16:13 +00001459 const auto non_broadcast_input_ptr = reinterpret_cast<const float16_t *>(non_broadcast_input.ptr());
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001460 const auto output_ptr = reinterpret_cast<float16_t *>(dst.ptr());
Pablo Marquez Telloc727d522021-01-27 14:16:13 +00001461 const auto broadcast_value = *reinterpret_cast<const float16_t *>(broadcast_input.ptr());
Omar Al Khatib605a9282022-11-01 17:01:24 +00001462 const float16x8x2_t broadcast_value_vec =
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +01001463 {
1464 {
Pablo Marquez Telloc727d522021-01-27 14:16:13 +00001465 vdupq_n_f16(broadcast_value),
1466 vdupq_n_f16(broadcast_value),
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +01001467 }
1468 };
Pablo Marquez Telloc727d522021-01-27 14:16:13 +00001469 const auto scale_vec = vdupq_n_f16(scale);
1470 // Compute window_step_x elements per iteration
1471 int x = window_start_x;
1472 for(; x <= (window_end_x - window_step_x); x += window_step_x)
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +01001473 {
Pablo Marquez Telloc727d522021-01-27 14:16:13 +00001474 const float16x8x2_t non_broadcast_v =
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +01001475 {
Pablo Marquez Telloc727d522021-01-27 14:16:13 +00001476 {
1477 vld1q_f16(non_broadcast_input_ptr + x),
1478 vld1q_f16(non_broadcast_input_ptr + x + 8),
1479 }
1480 };
1481 const float16x8x2_t result =
1482 {
1483 {
1484 vmulq_f16(vmulq_f16(broadcast_value_vec.val[0], non_broadcast_v.val[0]), scale_vec),
1485 vmulq_f16(vmulq_f16(broadcast_value_vec.val[1], non_broadcast_v.val[1]), scale_vec),
1486 }
1487 };
1488 vst1q_f16(output_ptr + x, result.val[0]);
1489 vst1q_f16(output_ptr + x + 8, result.val[1]);
1490 }
1491 // Compute left-over elements
1492 for(; x < window_end_x; ++x)
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +01001493 {
Pablo Marquez Telloc727d522021-01-27 14:16:13 +00001494 const auto non_broadcast_v = *(non_broadcast_input_ptr + x);
1495 *(output_ptr + x) = broadcast_value * non_broadcast_v * scale;
1496 }
1497 },
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001498 broadcast_input, non_broadcast_input, dst);
Pablo Marquez Telloc727d522021-01-27 14:16:13 +00001499 }
1500 else
1501 {
1502 input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
1503 input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001504 Iterator input1(src1, input1_win);
1505 Iterator input2(src2, input2_win);
1506 Iterator dst(out, win);
Omar Al Khatib605a9282022-11-01 17:01:24 +00001507 execute_window_loop(
1508 win, [&](const Coordinates &)
Michele Di Giorgio9428a182020-03-30 14:10:20 +01001509 {
Pablo Marquez Telloc727d522021-01-27 14:16:13 +00001510 const auto input1_ptr = reinterpret_cast<const float16_t *>(input1.ptr());
1511 const auto input2_ptr = reinterpret_cast<const float16_t *>(input2.ptr());
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001512 const auto output_ptr = reinterpret_cast<float16_t *>(dst.ptr());
Pablo Marquez Telloc727d522021-01-27 14:16:13 +00001513 // Compute window_step_x elements per iteration
1514 int x = window_start_x;
1515 for(; x <= (window_end_x - window_step_x); x += window_step_x)
1516 {
1517 const float16x8x2_t ta1 =
1518 {
1519 {
1520 vld1q_f16(input1_ptr + x),
1521 vld1q_f16(input1_ptr + x + 8),
1522 }
1523 };
1524 const float16x8x2_t ta2 =
1525 {
1526 {
1527 vld1q_f16(input2_ptr + x),
1528 vld1q_f16(input2_ptr + x + 8),
1529 }
1530 };
1531 const float16x8_t scale_vec = vdupq_n_f16(scale);
Omar Al Khatib605a9282022-11-01 17:01:24 +00001532 const float16x8x2_t result =
Pablo Marquez Telloc727d522021-01-27 14:16:13 +00001533 {
1534 {
1535 vmulq_f16(vmulq_f16(ta1.val[0], ta2.val[0]), scale_vec),
1536 vmulq_f16(vmulq_f16(ta1.val[1], ta2.val[1]), scale_vec),
1537 }
1538 };
1539 vst1q_f16(output_ptr + x, result.val[0]);
1540 vst1q_f16(output_ptr + x + 8, result.val[1]);
1541 }
1542 // Compute left-over elements
1543 for(; x < window_end_x; ++x)
1544 {
1545 const auto ta1 = *(input1_ptr + x);
1546 const auto ta2 = *(input2_ptr + x);
1547 *(output_ptr + x) = ta1 * ta2 * scale;
1548 }
1549 },
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001550 input1, input2, dst);
Pablo Marquez Telloc727d522021-01-27 14:16:13 +00001551 }
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +01001552}
Ioan-Cristian Szabo5edbd1c2017-11-13 13:34:08 +00001553#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
Pablo Tellodf246182017-07-03 16:25:09 +01001554
1555template <bool is_scale255, bool is_sat>
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001556void mul_U8_U8_S16(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, int n)
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001557{
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +01001558 // Create input windows
1559 Window win = window;
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001560 Window input1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
1561 Window input2_win = window.broadcast_if_dimension_le_one(src2->info()->tensor_shape());
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001562
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +01001563 // Clear X Dimension on execution window as we handle manually
1564 win.set(Window::DimX, Window::Dimension(0, 1, 1));
1565 input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
1566 input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001567
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001568 Iterator input1(src1, input1_win);
1569 Iterator input2(src2, input2_win);
1570 Iterator dst(out, win);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001571
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +01001572 const int window_step_x = 16 / sizeof(uint8_t);
1573 const auto window_start_x = static_cast<int>(window.x().start());
1574 const auto window_end_x = static_cast<int>(window.x().end());
1575
Omar Al Khatib605a9282022-11-01 17:01:24 +00001576 execute_window_loop(
1577 win, [&](const Coordinates &)
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001578 {
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +01001579 const auto input1_ptr = reinterpret_cast<const uint8_t *>(input1.ptr());
1580 const auto input2_ptr = reinterpret_cast<const uint8_t *>(input2.ptr());
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001581 const auto output_ptr = reinterpret_cast<int16_t *>(dst.ptr());
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001582
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +01001583 // Compute window_step_x elements per iteration
1584 int x = window_start_x;
1585 for(; x <= (window_end_x - window_step_x); x += window_step_x)
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001586 {
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +01001587 const uint8x16_t bv = wrapper::vloadq(input2_ptr + x);
1588 const uint8x16_t av = wrapper::vloadq(input1_ptr + x);
1589
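            // Widen each u8 half to u16 before multiplying: a full 255 * 255 product
            // needs 16 bits prior to scaling.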
1590 uint16x8_t tmp_low = vmovl_u8(vget_low_u8(av));
1591 uint16x8_t tmp_high = vmovl_u8(vget_high_u8(av));
1592 tmp_low = vmulq_u16(tmp_low, vmovl_u8(vget_low_u8(bv)));
1593 tmp_high = vmulq_u16(tmp_high, vmovl_u8(vget_high_u8(bv)));
1594
1595 if(is_scale255)
1596 {
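                // Scale the 16-bit products by 1/255, rounding to nearest (adding 0.5
                // before truncation), matching the scalar tail below.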
1597 tmp_low = scale255_U16_U16(tmp_low);
1598 tmp_high = scale255_U16_U16(tmp_high);
1599 }
1600 else
1601 {
1602 const int16x8_t vn = vdupq_n_s16(-n);
1603
1604 if(is_sat)
1605 {
1606 tmp_low = vqshlq_u16(tmp_low, vn);
1607 tmp_high = vqshlq_u16(tmp_high, vn);
1608 }
1609 else
1610 {
1611 tmp_low = vshlq_u16(tmp_low, vn);
1612 tmp_high = vshlq_u16(tmp_high, vn);
1613 }
1614 }
1615
1616 if(is_sat)
1617 {
1618 static const uint16x8_t max = vdupq_n_u16(SHRT_MAX);
1619
1620 tmp_low = vminq_u16(tmp_low, max);
1621 tmp_high = vminq_u16(tmp_high, max);
1622 }
1623
1624 vst1q_s16(output_ptr + x, vreinterpretq_s16_u16(tmp_low));
1625 vst1q_s16(output_ptr + x + 8, vreinterpretq_s16_u16(tmp_high));
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001626 }
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +01001627
1628 // Compute left-over elements
1629 for(; x < window_end_x; ++x)
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001630 {
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +01001631 int32_t tmp = static_cast<int32_t>(*(input1_ptr + x)) * static_cast<int32_t>(*(input2_ptr + x));
1632
1633 if(is_scale255)
1634 {
1635 float tmp_f = static_cast<float>(tmp) * scale255_constant;
1636 tmp = static_cast<int32_t>(tmp_f + 0.5f);
1637 }
1638 else
1639 {
1640 tmp >>= n;
1641 }
1642
1643 if(is_sat)
1644 {
1645 tmp = (tmp > SHRT_MAX) ? SHRT_MAX : tmp;
1646 }
1647
1648 *(output_ptr + x) = static_cast<int16_t>(tmp);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001649 }
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +01001650 },
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001651 input1, input2, dst);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001652}
1653
1654template <bool is_scale255, bool is_sat>
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001655void mul_S16_U8_S16(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, int n)
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001656{
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +01001657 // Create input windows
1658 Window win = window;
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001659 Window input1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
1660 Window input2_win = window.broadcast_if_dimension_le_one(src2->info()->tensor_shape());
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001661
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +01001662 // Clear X Dimension on execution window as we handle manually
1663 win.set(Window::DimX, Window::Dimension(0, 1, 1));
1664 input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
1665 input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001666
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001667 Iterator input1(src1, input1_win);
1668 Iterator input2(src2, input2_win);
1669 Iterator dst(out, win);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001670
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +01001671 const int window_step_x = 16;
1672 const auto window_start_x = static_cast<int>(window.x().start());
1673 const auto window_end_x = static_cast<int>(window.x().end());
1674
Omar Al Khatib605a9282022-11-01 17:01:24 +00001675 execute_window_loop(
1676 win, [&](const Coordinates &)
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +01001677 {
1678 const auto input1_ptr = reinterpret_cast<const int16_t *>(input1.ptr());
1679 const auto input2_ptr = reinterpret_cast<const uint8_t *>(input2.ptr());
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001680 const auto output_ptr = reinterpret_cast<int16_t *>(dst.ptr());
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +01001681
1682 // Compute window_step_x elements per iteration
1683 int x = window_start_x;
1684 for(; x <= (window_end_x - window_step_x); x += window_step_x)
1685 {
1686 const int16x8x2_t ta1 =
1687 {
1688 {
1689 vld1q_s16(input1_ptr + x),
1690 vld1q_s16(input1_ptr + x + 8),
1691 }
1692 };
1693 const uint8x8x2_t ta2u =
1694 {
1695 {
1696 vld1_u8(input2_ptr + x),
1697 vld1_u8(input2_ptr + x + 8),
1698 }
1699 };
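            // Widen the u8 input to s16 so the signed S16 * S16 helper can be reused.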
1700 const int16x8x2_t ta2 =
1701 {
1702 {
1703 vreinterpretq_s16_u16(vmovl_u8(ta2u.val[0])),
1704 vreinterpretq_s16_u16(vmovl_u8(ta2u.val[1]))
1705 }
1706 };
1707
1708 const int16x8x2_t result = mul_S16_S16_S16_n_k<is_scale255, is_sat>(ta1, ta2, n);
1709
1710 vst1q_s16(output_ptr + x, result.val[0]);
1711 vst1q_s16(output_ptr + x + 8, result.val[1]);
1712 }
1713
1714 // Compute left-over elements
1715 for(; x < window_end_x; ++x)
1716 {
1717 int32_t tmp = static_cast<int32_t>(*(input1_ptr + x)) * static_cast<int32_t>(*(input2_ptr + x));
1718
1719 if(is_scale255)
1720 {
1721 float tmp_f = static_cast<float>(tmp) * scale255_constant;
1722
1723 tmp = static_cast<int32_t>(tmp_f + 0.5f);
1724 }
1725 else
1726 {
1727 if(tmp >= 0)
1728 {
1729 tmp >>= n;
1730 }
1731 else
1732 {
1733 uint32_t mask = (1u << n) - 1;
1734 tmp = (tmp + static_cast<int32_t>(mask)) >> n;
1735 }
1736 }
1737 if(is_sat)
1738 {
1739 tmp = (tmp > SHRT_MAX) ? SHRT_MAX : ((tmp < SHRT_MIN) ? SHRT_MIN : tmp);
1740 }
1741 *(output_ptr + x) = static_cast<int16_t>(tmp);
1742 }
1743 },
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001744 input1, input2, dst);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001745}
1746
1747template <bool is_scale255, bool is_sat>
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001748void mul_U8_S16_S16(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, int n)
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001749{
    // Multiplication is commutative, so simply swap the two inputs and reuse mul_S16_U8_S16
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001751 mul_S16_U8_S16<is_scale255, is_sat>(src2, src1, out, window, n);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001752}
1753} // namespace
1754
Georgios Pinitas0dc0d8e2021-04-30 03:18:37 +01001755void CpuMulKernel::configure(ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy)
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001756{
Ioan-Cristian Szabo754e9522017-11-28 18:29:43 +00001757 ARM_COMPUTE_UNUSED(rounding_policy);
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001758 ARM_COMPUTE_ERROR_ON_NULLPTR(src1, src2, dst);
Georgios Pinitasf0dea702017-07-03 18:17:28 +01001759
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001760 ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src1, src2, dst, scale, overflow_policy, rounding_policy));
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001761
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001762 const TensorShape &out_shape = TensorShape::broadcast_shape(src1->tensor_shape(), src2->tensor_shape());
Michalis Spyrou861f0db2018-02-26 16:47:58 +00001763
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001764 // Auto initialize dst if not initialized
1765 set_shape_if_empty(*dst, out_shape);
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +01001766
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +01001767 _scale = scale;
1768 _scale_exponent = 0;
1769 _func_quantized = nullptr;
1770 _func_int = nullptr;
1771 _func_float = nullptr;
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001772
1773 bool is_scale_255 = false;
1774 // Check and validate scaling factor
1775 if(std::abs(scale - scale255_constant) < 0.00001f)
1776 {
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001777 is_scale_255 = true;
1778 }
1779 else
1780 {
Ioan-Cristian Szabo754e9522017-11-28 18:29:43 +00001781 int exponent = 0;
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001782
Ioan-Cristian Szabo754e9522017-11-28 18:29:43 +00001783 std::frexp(scale, &exponent);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001784
        // Store the positive exponent. We know that we compute 1/2^n
        // Additionally we need to subtract 1 to compensate for frexp using a mantissa of 0.5
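        // e.g. for scale = 1/8, std::frexp(0.125) yields 0.5 * 2^-2, so exponent = -2
        // and _scale_exponent = |-2 - 1| = 3, i.e. a right shift by 3 (1/2^3 == 1/8)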
1787 _scale_exponent = std::abs(exponent - 1);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001788 }
1789
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001790 const DataType dt_input1 = src1->data_type();
1791 const DataType dt_input2 = src2->data_type();
1792 const DataType dt_output = dst->data_type();
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001793 const bool is_sat = (overflow_policy == ConvertPolicy::SATURATE);
1794
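    // Select the kernel specialization from the data type combination; for the
    // integer kernels the 1/255 scaling and saturation options are baked in as
    // template parameters.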
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +01001795 switch(dt_input1)
Manuel Bottini79fa9a22019-02-22 17:54:22 +00001796 {
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +01001797 case DataType::QASYMM8:
1798 if(dt_input2 == DataType::QASYMM8 && dt_output == DataType::QASYMM8)
1799 {
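                // Prefer the fixed-point path when the tensors' quantization info and
                // scale allow it; otherwise fall back to the generic saturating kernel.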
Omar Al Khatib605a9282022-11-01 17:01:24 +00001800 if(mul_q8_neon_fixedpoint_possible(src1, src2, dst, scale))
1801 {
1802 _func_quantized = &mul_q8_neon_fixedpoint<uint8_t>;
1803 }
1804 else
1805 {
1806 _func_quantized = &mul_saturate_quantized_8<uint8_t>;
1807 }
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +01001808 }
1809 break;
1810 case DataType::QASYMM8_SIGNED:
1811 if(dt_input2 == DataType::QASYMM8_SIGNED)
1812 {
Omar Al Khatib605a9282022-11-01 17:01:24 +00001813 if(mul_q8_neon_fixedpoint_possible(src1, src2, dst, scale))
1814 {
1815 _func_quantized = &mul_q8_neon_fixedpoint<int8_t>;
1816 }
1817 else
1818 {
1819 _func_quantized = &mul_saturate_quantized_8<int8_t>;
1820 }
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +01001821 }
1822 break;
1823 case DataType::QSYMM16:
1824 if(dt_input2 == DataType::QSYMM16 && dt_output == DataType::QSYMM16)
1825 {
1826 _func_quantized = &mul_saturate_QSYMM16_QSYMM16_QSYMM16;
1827 }
1828 else if(dt_input2 == DataType::QSYMM16 && dt_output == DataType::S32)
1829 {
1830 _func_int = &mul_QSYMM16_QSYMM16_S32;
1831 }
1832 break;
1833 case DataType::S16:
1834 if(DataType::U8 == dt_input2 && DataType::S16 == dt_output)
1835 {
1836 if(is_scale_255)
1837 {
1838 _func_int = is_sat ? &mul_S16_U8_S16<true, true> : &mul_S16_U8_S16<true, false>;
1839 }
1840 else
1841 {
1842 _func_int = is_sat ? &mul_S16_U8_S16<false, true> : &mul_S16_U8_S16<false, false>;
1843 }
1844 }
1845 if(DataType::S16 == dt_input2 && DataType::S16 == dt_output)
1846 {
1847 if(is_scale_255)
1848 {
1849 _func_int = is_sat ? &mul_S16_S16_S16<true, true> : &mul_S16_S16_S16<true, false>;
1850 }
1851 else
1852 {
1853 _func_int = is_sat ? &mul_S16_S16_S16<false, true> : &mul_S16_S16_S16<false, false>;
1854 }
1855 }
1856 break;
SiCong Libb88f892020-08-28 11:18:47 +01001857 case DataType::S32:
1858 if(DataType::S32 == dt_input2 && DataType::S32 == dt_output)
1859 {
1860 _func_int = is_sat ? &mul_S32_S32_S32<true> : &mul_S32_S32_S32<false>;
1861 }
1862 break;
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +01001863 case DataType::U8:
1864 if(DataType::U8 == dt_input2 && DataType::U8 == dt_output)
1865 {
1866 if(is_scale_255)
1867 {
1868 _func_int = is_sat ? &mul_U8_U8_U8<true, true> : &mul_U8_U8_U8<true, false>;
1869 }
1870 else
1871 {
1872 _func_int = is_sat ? &mul_U8_U8_U8<false, true> : &mul_U8_U8_U8<false, false>;
1873 }
1874 }
1875 else if(DataType::U8 == dt_input2 && DataType::S16 == dt_output)
1876 {
1877 if(is_scale_255)
1878 {
1879 _func_int = is_sat ? &mul_U8_U8_S16<true, true> : &mul_U8_U8_S16<true, false>;
1880 }
1881 else
1882 {
1883 _func_int = is_sat ? &mul_U8_U8_S16<false, true> : &mul_U8_U8_S16<false, false>;
1884 }
1885 }
1886 else if(DataType::S16 == dt_input2 && DataType::S16 == dt_output)
1887 {
1888 if(is_scale_255)
1889 {
1890 _func_int = is_sat ? &mul_U8_S16_S16<true, true> : &mul_U8_S16_S16<true, false>;
1891 }
1892 else
1893 {
1894 _func_int = is_sat ? &mul_U8_S16_S16<false, true> : &mul_U8_S16_S16<false, false>;
1895 }
1896 }
1897 break;
1898#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
1899 case DataType::F16:
1900 _func_float = &mul_F16_F16_F16;
1901 break;
1902#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
1903 case DataType::F32:
1904 _func_float = &mul_F32_F32_F32;
1905 break;
1906 default:
            ARM_COMPUTE_ERROR("Unsupported combination of input and output data types");
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001908 }
1909
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +01001910 // Configure kernel window
Viet-Hoa Do0d05b662022-09-09 15:39:05 +01001911 Window win;
1912 std::tie(win, _split_dimension) = calculate_squashed_or_max_window(*src1, *src2);
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +01001913
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001914 ICpuKernel::configure(win);
Ioan-Cristian Szabo754e9522017-11-28 18:29:43 +00001915}
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001916
Fadi Arafeh73bb6b72022-10-06 16:20:14 +00001917size_t CpuMulKernel::get_mws(const CPUInfo &platform, size_t thread_count) const
1918{
1919 ARM_COMPUTE_UNUSED(thread_count);
1920
1921#if defined(ENABLE_FP32_KERNELS)
1922 if(this->_func_float == &mul_F32_F32_F32)
1923 {
1924 size_t mws = ICPPKernel::default_mws;
1925 if(platform.get_cpu_model() == CPUModel::N1)
1926 {
1927 mws = default_mws_N1_fp32_neon;
1928 }
1929 else if(platform.get_cpu_model() == CPUModel::V1)
1930 {
1931 mws = default_mws_V1_fp32_neon;
1932 }
        else
        {
            if(_split_dimension == Window::DimX)
            {
                // Don't split the work load too small if the tensor has been reinterpreted as 1D.
                // This number is loosely chosen as threading overhead in each platform varies wildly.
                return 10240;
            }
            return ICPPKernel::default_mws;
        }
1937
1938 // tensor is 1D or was re-interpreted as 1D
1939 if(this->window().shape().num_dimensions() == 1)
1940 {
1941 return mws;
1942 }
1943 else
1944 {
            // Scale mws down by the number of iterations along all the dimensions (x, z, w, etc.) except
            // the one that we parallelize along (the y dimension). This allows for parallelization when
            // the y size is small but the other sizes are large, which boosts performance.
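            // Hypothetical example: for a 256x32x2 iteration space on N1,
            // 22447 / ((256 * 32 * 2) / 32) = 22447 / 512, i.e. an mws of ~43.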
1948 mws = static_cast<size_t>(mws / (this->window().num_iterations_total() / this->window().num_iterations(1)));
1949 return std::max(static_cast<size_t>(1), mws);
1950 }
1951 }
1952#else /* ENABLE_FP32_KERNELS */
1953 ARM_COMPUTE_UNUSED(platform);
1954#endif /* ENABLE_FP32_KERNELS */
    if(_split_dimension == Window::DimX)
    {
        // Don't split the work load too small if the tensor has been reinterpreted as 1D.
        // This number is loosely chosen as threading overhead in each platform varies wildly.
        return 10240;
    }
    return ICPPKernel::default_mws;
1956}
1957
Georgios Pinitas0dc0d8e2021-04-30 03:18:37 +01001958Status CpuMulKernel::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float scale, ConvertPolicy overflow_policy,
1959 RoundingPolicy rounding_policy)
Ioan-Cristian Szabo754e9522017-11-28 18:29:43 +00001960{
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001961 ARM_COMPUTE_ERROR_ON_NULLPTR(src1, src2, dst);
1962 ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src1, src2, dst, scale, overflow_policy, rounding_policy));
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001963
Georgios Pinitas631c41a2017-12-06 11:53:03 +00001964 return Status{};
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001965}
1966
Georgios Pinitas0dc0d8e2021-04-30 03:18:37 +01001967void CpuMulKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001968{
Moritz Pflanzerc186b572017-09-07 09:48:04 +01001969 ARM_COMPUTE_UNUSED(info);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001970 ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001971 ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001972
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001973 auto src1 = tensors.get_const_tensor(TensorType::ACL_SRC_0);
1974 auto src2 = tensors.get_const_tensor(TensorType::ACL_SRC_1);
1975 auto dst = tensors.get_tensor(TensorType::ACL_DST);
Michalis Spyrou6eb73452020-07-02 17:39:25 +01001976
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +01001977 if(_func_quantized != nullptr)
Michalis Spyrou861f0db2018-02-26 16:47:58 +00001978 {
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001979 (*_func_quantized)(src1, src2, dst, window, _scale);
Manuel Bottini79fa9a22019-02-22 17:54:22 +00001980 }
1981 else if(_func_int != nullptr)
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001982 {
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001983 (*_func_int)(src1, src2, dst, window, _scale_exponent);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001984 }
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001985 else
1986 {
1987 ARM_COMPUTE_ERROR_ON(_func_float == nullptr);
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001988 (*_func_float)(src1, src2, dst, window, _scale);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001989 }
1990}
Viet-Hoa Dod4a9cc02022-11-08 12:01:21 +00001991
Georgios Pinitas0dc0d8e2021-04-30 03:18:37 +01001992const char *CpuMulKernel::name() const
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001993{
Georgios Pinitas0dc0d8e2021-04-30 03:18:37 +01001994 return "CpuMulKernel";
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001995}
Viet-Hoa Dod4a9cc02022-11-08 12:01:21 +00001996
giuros01154bc1c2019-03-26 17:44:40 +00002011namespace
2012{
Sheri Zhang1e3ab422021-03-16 17:35:08 +00002013Status validate_arguments_complex(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst)
giuros01154bc1c2019-03-26 17:44:40 +00002014{
Sheri Zhang1e3ab422021-03-16 17:35:08 +00002015 ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src1, 2, DataType::F32);
2016 ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src2, 2, DataType::F32);
giuros01154bc1c2019-03-26 17:44:40 +00002017
Sheri Zhang1e3ab422021-03-16 17:35:08 +00002018 const TensorShape &out_shape = TensorShape::broadcast_shape(src1->tensor_shape(), src2->tensor_shape());
giuros01154bc1c2019-03-26 17:44:40 +00002019
2020 ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible");
2021
Sheri Zhang1e3ab422021-03-16 17:35:08 +00002022 // Validate in case of configured dst
2023 if(dst->total_size() > 0)
giuros01154bc1c2019-03-26 17:44:40 +00002024 {
Sheri Zhang1e3ab422021-03-16 17:35:08 +00002025 ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 2, DataType::F32);
2026 ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, dst->tensor_shape(), 0), "Wrong shape for dst");
giuros01154bc1c2019-03-26 17:44:40 +00002027 }
2028
2029 return Status{};
2030}
Sheri Zhang4d91dc62020-09-23 11:22:50 +01002031} // namespace
giuros01154bc1c2019-03-26 17:44:40 +00002032
Georgios Pinitas0dc0d8e2021-04-30 03:18:37 +01002033void CpuComplexMulKernel::configure(ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst)
giuros01154bc1c2019-03-26 17:44:40 +00002034{
Sheri Zhang1e3ab422021-03-16 17:35:08 +00002035 ARM_COMPUTE_ERROR_ON_NULLPTR(src1, src2, dst);
2036 ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_complex(src1, src2, dst));
Sheri Zhang4d91dc62020-09-23 11:22:50 +01002037
Sheri Zhang1e3ab422021-03-16 17:35:08 +00002038 const TensorShape &out_shape = TensorShape::broadcast_shape(src1->tensor_shape(), src2->tensor_shape());
giuros01154bc1c2019-03-26 17:44:40 +00002039
Sheri Zhang1e3ab422021-03-16 17:35:08 +00002040 // Auto initialize dst if not initialized
2041 const TensorInfo out_info(out_shape, src1->num_channels(), src1->data_type());
2042 auto_init_if_empty(*dst, out_info);
giuros01154bc1c2019-03-26 17:44:40 +00002043
giuros01154bc1c2019-03-26 17:44:40 +00002044 // Configure kernel window
SiCongLic7b1e842021-02-22 14:28:33 +00002045 Window win = calculate_max_window(out_shape);
giuros01154bc1c2019-03-26 17:44:40 +00002046
Sheri Zhang1e3ab422021-03-16 17:35:08 +00002047 ICpuKernel::configure(win);
giuros01154bc1c2019-03-26 17:44:40 +00002048}
2049
Georgios Pinitas0dc0d8e2021-04-30 03:18:37 +01002050Status CpuComplexMulKernel::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst)
giuros01154bc1c2019-03-26 17:44:40 +00002051{
Sheri Zhang1e3ab422021-03-16 17:35:08 +00002052 ARM_COMPUTE_ERROR_ON_NULLPTR(src1, src2, dst);
2053 ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_complex(src1, src2, dst));
giuros01154bc1c2019-03-26 17:44:40 +00002054
2055 return Status{};
2056}
2057
Georgios Pinitas0dc0d8e2021-04-30 03:18:37 +01002058void CpuComplexMulKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
giuros01154bc1c2019-03-26 17:44:40 +00002059{
2060 ARM_COMPUTE_UNUSED(info);
2061 ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
Sheri Zhang1e3ab422021-03-16 17:35:08 +00002062 ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window);
giuros01154bc1c2019-03-26 17:44:40 +00002063
Sheri Zhang1e3ab422021-03-16 17:35:08 +00002064 auto src1 = tensors.get_const_tensor(TensorType::ACL_SRC_0);
2065 auto src2 = tensors.get_const_tensor(TensorType::ACL_SRC_1);
2066 auto dst = tensors.get_tensor(TensorType::ACL_DST);
Michalis Spyrou6eb73452020-07-02 17:39:25 +01002067
Sheri Zhang1e3ab422021-03-16 17:35:08 +00002068 c_mul_F32_F32_F32_n(src1, src2, dst, window);
giuros01154bc1c2019-03-26 17:44:40 +00002069}
Sheri Zhang1e3ab422021-03-16 17:35:08 +00002070
Georgios Pinitas0dc0d8e2021-04-30 03:18:37 +01002071const char *CpuComplexMulKernel::name() const
Sheri Zhang1e3ab422021-03-16 17:35:08 +00002072{
Georgios Pinitas0dc0d8e2021-04-30 03:18:37 +01002073 return "CpuComplexMulKernel";
Sheri Zhang1e3ab422021-03-16 17:35:08 +00002074}
2075} // namespace kernels
2076} // namespace cpu
Manuel Bottini79fa9a22019-02-22 17:54:22 +00002077} // namespace arm_compute