blob: 82e544532168cb5082e80b15fff0206cea23e2e2 [file] [log] [blame]
/*
 * Copyright (c) 2016-2022 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
Georgios Pinitas7891a732021-08-20 21:39:25 +010024#include "src/cpu/kernels/CpuMulKernel.h"
Anthony Barbier6ff3b192017-09-04 18:44:23 +010025
Sheri Zhang1e3ab422021-03-16 17:35:08 +000026#include "arm_compute/core/ITensor.h"
Anthony Barbier6ff3b192017-09-04 18:44:23 +010027#include "arm_compute/core/TensorInfo.h"
Sang-Hoon Park68dd25f2020-10-19 16:00:11 +010028#include "src/core/CPP/Validate.h"
Georgios Pinitasddb93bb2020-10-02 16:38:59 +010029#include "src/core/NEON/NEAsymm.h"
30#include "src/core/NEON/NESymm.h"
31#include "src/core/NEON/wrapper/wrapper.h"
Sang-Hoon Park68dd25f2020-10-19 16:00:11 +010032#include "src/core/helpers/AutoConfiguration.h"
33#include "src/core/helpers/WindowHelpers.h"
Anthony Barbier6ff3b192017-09-04 18:44:23 +010034
35#include <arm_neon.h>
Anthony Barbier6ff3b192017-09-04 18:44:23 +010036
Anthony Barbier6ff3b192017-09-04 18:44:23 +010037namespace arm_compute
38{
Sheri Zhang1e3ab422021-03-16 17:35:08 +000039namespace cpu
40{
41namespace kernels
42{
Anthony Barbier6ff3b192017-09-04 18:44:23 +010043namespace
44{
// Reciprocal of 255, used by the paths that detect a requested scale of exactly 1/255
// (see validate_arguments, which compares the user scale against this constant).
const float scale255_constant = 1.f / 255.f;
// The 1/255 constant broadcast across a 4-lane float vector for the NEON scaling helpers.
const float32x4_t scale255_constant_f32q = vdupq_n_f32(scale255_constant);
// +0.5f broadcast across 4 lanes; added before a truncating float->int conversion so the
// conversion behaves as round-half-up (see scale255_S32_S32).
const float32x4_t positive_round_f32q = vdupq_n_f32(0.5f);
48
Sheri Zhang1e3ab422021-03-16 17:35:08 +000049inline Status validate_arguments(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy)
Ioan-Cristian Szabo754e9522017-11-28 18:29:43 +000050{
51 ARM_COMPUTE_UNUSED(overflow_policy);
52 ARM_COMPUTE_UNUSED(rounding_policy);
53
Sheri Zhang1e3ab422021-03-16 17:35:08 +000054 ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src1);
55 ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src1, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::S16, DataType::S32, DataType::QSYMM16, DataType::F16,
SiCong Libb88f892020-08-28 11:18:47 +010056 DataType::F32);
Sheri Zhang1e3ab422021-03-16 17:35:08 +000057 ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src2, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::S16, DataType::S32, DataType::QSYMM16, DataType::F16,
SiCong Libb88f892020-08-28 11:18:47 +010058 DataType::F32);
Sheri Zhang1e3ab422021-03-16 17:35:08 +000059 ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
Michele Di Giorgio9428a182020-03-30 14:10:20 +010060 DataType::S16, DataType::QSYMM16,
61 DataType::S32, DataType::F16, DataType::F32);
Sheri Zhang1e3ab422021-03-16 17:35:08 +000062 if(is_data_type_quantized(src1->data_type()) || is_data_type_quantized(src2->data_type()))
Pablo Tello52ea9c22019-12-10 11:28:53 +000063 {
Sheri Zhang1e3ab422021-03-16 17:35:08 +000064 ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src1, src2);
Georgios Pinitasd7d7e902019-12-18 15:40:54 +000065 ARM_COMPUTE_RETURN_ERROR_ON_MSG(overflow_policy == ConvertPolicy::WRAP, "ConvertPolicy cannot be WRAP if datatype is quantized");
Pablo Tello52ea9c22019-12-10 11:28:53 +000066 }
Manuel Bottini79fa9a22019-02-22 17:54:22 +000067
Sheri Zhang1e3ab422021-03-16 17:35:08 +000068 if(dst->total_size() > 0)
Manuel Bottini79fa9a22019-02-22 17:54:22 +000069 {
Sheri Zhang1e3ab422021-03-16 17:35:08 +000070 const TensorShape &out_shape = TensorShape::broadcast_shape(src1->tensor_shape(), src2->tensor_shape());
71 ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, dst->tensor_shape(), 0), "Wrong shape for dst");
Manuel Bottini79fa9a22019-02-22 17:54:22 +000072 ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible");
SiCong Libb88f892020-08-28 11:18:47 +010073 // clang-format off
74 ARM_COMPUTE_RETURN_ERROR_ON_MSG(
Sheri Zhang1e3ab422021-03-16 17:35:08 +000075 !(src1->data_type() == src2->data_type() && src2->data_type() == dst->data_type()) &&
76 !(src1->data_type() == DataType::U8 && src2->data_type() == DataType::U8 && dst->data_type() == DataType::S16) &&
77 !(src1->data_type() == DataType::U8 && src2->data_type() == DataType::S16 && dst->data_type() == DataType::S16) &&
78 !(src1->data_type() == DataType::S16 && src2->data_type() == DataType::U8 && dst->data_type() == DataType::S16) &&
79 !(src1->data_type() == DataType::S16 && src2->data_type() == DataType::U8 && dst->data_type() == DataType::S16) &&
80 !(src1->data_type() == DataType::QSYMM16 && src2->data_type() == DataType::QSYMM16 && dst->data_type() == DataType::S32)
SiCong Libb88f892020-08-28 11:18:47 +010081 , "Invalid data type combination");
82 // clang-format on
Sheri Zhang1e3ab422021-03-16 17:35:08 +000083 ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->data_type() == DataType::S16 && dst->data_type() == DataType::S32 && scale != 1.f, "Unsupported scale for QSYMM16 inputs and S32 dst");
Manuel Bottini79fa9a22019-02-22 17:54:22 +000084 }
Michalis Spyrou861f0db2018-02-26 16:47:58 +000085
Ioan-Cristian Szabo754e9522017-11-28 18:29:43 +000086 if(std::abs(scale - scale255_constant) < 0.00001f)
87 {
88 ARM_COMPUTE_RETURN_ERROR_ON(rounding_policy != RoundingPolicy::TO_NEAREST_UP && rounding_policy != RoundingPolicy::TO_NEAREST_EVEN);
Sheri Zhang1e3ab422021-03-16 17:35:08 +000089 ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->data_type() == DataType::S32 && src2->data_type() == DataType::S32 && dst->data_type() == DataType::S32,
90 "Scale == 1/255 is not supported if input and dst are of data type S32");
Ioan-Cristian Szabo754e9522017-11-28 18:29:43 +000091 }
92 else
93 {
94 ARM_COMPUTE_RETURN_ERROR_ON(rounding_policy != RoundingPolicy::TO_ZERO);
95
96 int exponent = 0;
97 const float normalized_mantissa = std::frexp(scale, &exponent);
98
99 // Use int scaling if factor is equal to 1/2^n for 0 <= n <= 15
100 // frexp returns 0.5 as mantissa which means that the exponent will be in the range of -1 <= e <= 14
101 // Moreover, it will be negative as we deal with 1/2^n
102 ARM_COMPUTE_RETURN_ERROR_ON_MSG(!((normalized_mantissa == 0.5f) && (-14 <= exponent) && (exponent <= 1)), "Scale value not supported (Should be 1/(2^n) or 1/255");
103 }
104
Georgios Pinitas631c41a2017-12-06 11:53:03 +0000105 return Status{};
Ioan-Cristian Szabo754e9522017-11-28 18:29:43 +0000106}
107
/* Scales a given vector by 1/255.
 *
 * @note This does not work for all cases. e.g. for float of 0.49999999999999994 and large floats.
 *
 * @param in Input vector to scale.
 * @return Scaled dst rounded to nearest (round half up).
 */
inline int32x4_t scale255_S32_S32(int32x4_t in)
{
    // Multiply by 1/255 in floating point...
    const float32x4_t scaled = vmulq_f32(vcvtq_f32_s32(in), scale255_constant_f32q);
    // ...then bias every lane by +0.5 so that the truncating (toward-zero)
    // float->int conversion below implements round-half-up.
    const float32x4_t biased = vaddq_f32(scaled, positive_round_f32q);
    return vcvtq_s32_f32(biased);
}
124
/* Scales a vector of 8 unsigned 16-bit values by 1/255 (round half up).
 *
 * Each half of the input is widened to 32 bits, scaled through scale255_S32_S32
 * and narrowed back; the halves are recombined in their original lane order.
 *
 * @param in Input vector to scale.
 * @return Scaled vector with the same lane order as the input.
 */
inline uint16x8_t scale255_U16_U16(uint16x8_t in)
{
    const int32x4_t scaled_lo = scale255_S32_S32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(in))));
    const int32x4_t scaled_hi = scale255_S32_S32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(in))));
    return vreinterpretq_u16_s16(vcombine_s16(vmovn_s32(scaled_lo), vmovn_s32(scaled_hi)));
}
131
// Quantizes four float32x4 vectors into 16 signed 8-bit values.
// Selected via SFINAE when T == int8_t (the QASYMM8_SIGNED path); dispatches to the
// vquantize_signed helper (brought in through the NEAsymm.h include above).
template <typename T>
inline typename std::enable_if<std::is_same<T, int8_t>::value, int8x16_t>::type
vquantize(float32x4x4_t val, const UniformQuantizationInfo &info)
{
    return vquantize_signed(val, info);
}
138
// Quantizes four float32x4 vectors into 16 unsigned 8-bit values.
// Selected via SFINAE when T == uint8_t (the QASYMM8 path). The unqualified call below
// cannot re-select this template (T is not deducible from the arguments), so overload
// resolution picks a non-template vquantize overload — presumably the one from
// NEAsymm.h included above; no infinite recursion is possible.
template <typename T>
inline typename std::enable_if<std::is_same<T, uint8_t>::value, uint8x16_t>::type
vquantize(float32x4x4_t val, const UniformQuantizationInfo &info)
{
    return vquantize(val, info);
}
145
/** Element-wise multiplication of two 8-bit quantized tensors (QASYMM8 or QASYMM8_SIGNED,
 *  selected by T) with saturation.
 *
 * Each element pair is dequantized to float, multiplied, and requantized. The user scale
 * is folded into the output quantization info (tmp_qua_info below), so no separate
 * multiplication by @p scale is needed per element.
 *
 * @param[in]  src1   First input tensor.
 * @param[in]  src2   Second input tensor.
 * @param[out] out    Output tensor.
 * @param[in]  window Execution window.
 * @param[in]  scale  Scale applied to the product.
 */
template <typename T>
void mul_saturate_quantized_8(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, float scale)
{
    // Create input windows
    Window win        = window;
    Window input1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
    Window input2_win = window.broadcast_if_dimension_le_one(src2->info()->tensor_shape());

    // Clear X Dimension on execution window as we handle manually
    win.set(Window::DimX, Window::Dimension(0, 1, 1));

    const int  window_step_x         = 16 / sizeof(T);  // 16 bytes per NEON register
    const auto window_start_x        = static_cast<int>(window.x().start());
    const auto window_end_x          = static_cast<int>(window.x().end());
    const bool is_broadcast_across_x = src1->info()->tensor_shape().x() != src2->info()->tensor_shape().x();

    // Fold the user scale into the output quantization scale: dividing the output scale
    // by `scale` makes the subsequent quantize step apply the multiplication by `scale`.
    const UniformQuantizationInfo output_qua_info = out->info()->quantization_info().uniform();
    const UniformQuantizationInfo tmp_qua_info    = { output_qua_info.scale / scale, output_qua_info.offset };

    if(is_broadcast_across_x)
    {
        // One input has extent 1 on X: its single value per row is broadcast across the row.
        const bool                    is_broadcast_input_2 = input2_win.x().step() == 0;
        Window                        broadcast_win        = is_broadcast_input_2 ? input2_win : input1_win;
        Window                        non_broadcast_win    = !is_broadcast_input_2 ? input2_win : input1_win;
        const ITensor                *broadcast_tensor     = is_broadcast_input_2 ? src2 : src1;
        const ITensor                *non_broadcast_tensor = !is_broadcast_input_2 ? src2 : src1;
        const UniformQuantizationInfo broadcast_qinfo      = broadcast_tensor->info()->quantization_info().uniform();
        const UniformQuantizationInfo non_broadcast_qinfo  = non_broadcast_tensor->info()->quantization_info().uniform();

        // Clear X Dimension on execution window as we handle manually
        non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));

        Iterator broadcast_input(broadcast_tensor, broadcast_win);
        Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
        Iterator dst(out, win);

        using ExactTagType = typename wrapper::traits::neon_vector<T, window_step_x>::tag_type;

        execute_window_loop(
            win, [&](const Coordinates &)
        {
            const auto non_broadcast_input_ptr = reinterpret_cast<const T *>(non_broadcast_input.ptr());
            const auto output_ptr              = reinterpret_cast<T *>(dst.ptr());

            // One scalar from the broadcast tensor per row, splatted across a full vector.
            const auto broadcast_value     = *reinterpret_cast<const T *>(broadcast_input.ptr());
            const auto broadcast_value_vec = wrapper::vdup_n(broadcast_value, ExactTagType{});

            // Compute window_step_x elements per iteration
            int x = window_start_x;
            for(; x <= (window_end_x - window_step_x); x += window_step_x)
            {
                const auto non_broadcast_v = wrapper::vloadq(non_broadcast_input_ptr + x);

                // Dequantize inputs
                const float32x4x4_t in1_f32x4x4 = vdequantize(non_broadcast_v, non_broadcast_qinfo);
                const float32x4x4_t in2_f32x4x4 = vdequantize(broadcast_value_vec, broadcast_qinfo);

                // Multiply in float, lane by lane.
                const float32x4x4_t out_f32x4x4 =
                {
                    vmulq_f32(in1_f32x4x4.val[0], in2_f32x4x4.val[0]),
                    vmulq_f32(in1_f32x4x4.val[1], in2_f32x4x4.val[1]),
                    vmulq_f32(in1_f32x4x4.val[2], in2_f32x4x4.val[2]),
                    vmulq_f32(in1_f32x4x4.val[3], in2_f32x4x4.val[3]),
                };

                // Quantize dst (tmp_qua_info carries the folded user scale)
                const auto result = vquantize<T>(out_f32x4x4, tmp_qua_info);
                wrapper::vstore(output_ptr + x, result);
            }

            // Compute left-over elements (scalar tail)
            for(; x < window_end_x; ++x)
            {
                // Dequantize inputs
                // NOTE(review): this local `src1` shadows the function parameter `src1`
                // (the ITensor*); behavior is correct but the naming is unfortunate.
                const T     src1    = *(non_broadcast_input_ptr + x);
                const float tmp_in1 = Qasymm8QuantizationHelper<T>::dequantize(src1, non_broadcast_qinfo);
                const float tmp_in2 = Qasymm8QuantizationHelper<T>::dequantize(broadcast_value, broadcast_qinfo);
                const float tmp_f   = tmp_in1 * tmp_in2;

                // Quantize dst
                const auto tmp_qua = Qasymm8QuantizationHelper<T>::quantize(tmp_f, tmp_qua_info);
                *(output_ptr + x)  = tmp_qua;
            }
        },
        broadcast_input, non_broadcast_input, dst);
    }
    else
    {
        // No X broadcast: both inputs are walked element by element.
        const UniformQuantizationInfo input1_qua_info = src1->info()->quantization_info().uniform();
        const UniformQuantizationInfo input2_qua_info = src2->info()->quantization_info().uniform();

        // Clear X Dimension on execution window as we handle manually
        input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
        input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));

        Iterator input1(src1, input1_win);
        Iterator input2(src2, input2_win);
        Iterator dst(out, win);

        execute_window_loop(
            win, [&](const Coordinates &)
        {
            const auto input1_ptr = reinterpret_cast<const T *>(input1.ptr());
            const auto input2_ptr = reinterpret_cast<const T *>(input2.ptr());
            const auto output_ptr = reinterpret_cast<T *>(dst.ptr());

            // Compute window_step_x elements per iteration
            int x = window_start_x;
            for(; x <= (window_end_x - window_step_x); x += window_step_x)
            {
                const auto input1_q = wrapper::vloadq(input1_ptr + x);
                const auto input2_q = wrapper::vloadq(input2_ptr + x);

                // Dequantize inputs
                const float32x4x4_t in1_f32x4x4 = vdequantize(input1_q, input1_qua_info);
                const float32x4x4_t in2_f32x4x4 = vdequantize(input2_q, input2_qua_info);

                // Multiply in float, lane by lane.
                const float32x4x4_t out_f32x4x4 =
                {
                    vmulq_f32(in1_f32x4x4.val[0], in2_f32x4x4.val[0]),
                    vmulq_f32(in1_f32x4x4.val[1], in2_f32x4x4.val[1]),
                    vmulq_f32(in1_f32x4x4.val[2], in2_f32x4x4.val[2]),
                    vmulq_f32(in1_f32x4x4.val[3], in2_f32x4x4.val[3]),
                };

                // Quantize dst (tmp_qua_info carries the folded user scale)
                const auto result = vquantize<T>(out_f32x4x4, tmp_qua_info);
                wrapper::vstore(output_ptr + x, result);
            }

            // Compute left-over elements (scalar tail)
            for(; x < window_end_x; ++x)
            {
                // Dequantize inputs
                // NOTE(review): these locals shadow the function parameters `src1`/`src2`
                // (the ITensor*); behavior is correct but the naming is unfortunate.
                const T     src1    = *(input1_ptr + x);
                const T     src2    = *(input2_ptr + x);
                const float tmp_in1 = Qasymm8QuantizationHelper<T>::dequantize(src1, input1_qua_info);
                const float tmp_in2 = Qasymm8QuantizationHelper<T>::dequantize(src2, input2_qua_info);
                const float tmp_f   = tmp_in1 * tmp_in2;

                // Quantize dst
                const auto tmp_qua = Qasymm8QuantizationHelper<T>::quantize(tmp_f, tmp_qua_info);
                *(output_ptr + x)  = tmp_qua;
            }
        },
        input1, input2, dst);
    }
}
294
Omar Al Khatib605a9282022-11-01 17:01:24 +0000295bool mul_q8_neon_fixedpoint_possible(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, float scale)
296{
297 const auto iq0 = src0->quantization_info().uniform();
298 const auto iq1 = src1->quantization_info().uniform();
299 const auto oq = dst->quantization_info().uniform();
300
301 const auto multiplier = ((iq0.scale * iq1.scale) / oq.scale) * scale;
302
303 if(multiplier < -8191.f || multiplier > 8191.f)
304 {
305 //The multiplier cannot be stored as a 14.18 signed fixed-point number
306 return false;
307 }
308
309 const auto offset_out = float(oq.offset);
310
311 const auto max_result = multiplier * (256) * (256) + offset_out;
312
313 if(max_result > 8191.f)
314 {
315 //It might not be possible to store the result as a 14.18 signed fixed-point number.
316 return false;
317 }
318
319 return true;
320}
321
/** Integer-only multiplication of two 8-bit quantized tensors of the same type
 *  (ScalarType is int8_t or uint8_t).
 *
 * Inputs are un-offsetted and multiplied in 32-bit integers; the product is then
 * requantized with a multiplier held in 14.18 signed fixed point (the `_14p18`
 * suffix means the value is pre-multiplied by 2^18; `_16p0`/`_32p0`/`_8p0` are
 * plain integers of that width). Eligibility is checked beforehand by
 * mul_q8_neon_fixedpoint_possible.
 *
 * @param[in]  src0   First input tensor.
 * @param[in]  src1   Second input tensor.
 * @param[out] dst    Output tensor.
 * @param[in]  window Execution window.
 * @param[in]  scale  Scale applied to the product (folded into the multiplier).
 */
template <typename ScalarType>
void mul_q8_neon_fixedpoint(const ITensor *src0, const ITensor *src1, ITensor *dst, const Window &window, float scale)
{
    const auto in0_info = src0->info();
    const auto in1_info = src1->info();

    const auto &in0_shape = in0_info->tensor_shape();
    const auto &in1_shape = in1_info->tensor_shape();

    // Create input windows.
    Window in0_win = window.broadcast_if_dimension_le_one(in0_shape);
    Window in1_win = window.broadcast_if_dimension_le_one(in1_shape);

    // Clear the x dimension on the execution window as we process the whole row each iteration.
    Window win = window;
    win.set(Window::DimX, Window::Dimension(0, 1, 1));

    constexpr int window_step_x         = 16;  // 16 x 8-bit elements per NEON register
    const auto    window_start_x        = window.x().start();
    const auto    window_end_x          = window.x().end();
    const auto    is_broadcast_across_x = in0_shape.x() != in1_shape.x();

    const auto iq0_info = in0_info->quantization_info().uniform();
    const auto iq1_info = in1_info->quantization_info().uniform();
    const auto oq_info  = dst->info()->quantization_info().uniform();

    const auto in0_offset = iq0_info.offset;
    const auto in1_offset = iq1_info.offset;
    const auto out_offset = oq_info.offset;
    // Combined requantization multiplier applied to the raw integer product.
    const auto multiplier = ((iq0_info.scale * iq1_info.scale) / oq_info.scale) * scale;

    constexpr int32_t two_pwr18i = 262144;  // 2^18, the fixed-point scaling factor
    constexpr float   two_pwr18f = 262144.f;

    const auto in0_offset_16p0  = static_cast<int16_t>(in0_offset);
    const auto in1_offset_16p0  = static_cast<int16_t>(in1_offset);
    const auto out_offset_14p18 = static_cast<int32_t>(out_offset * two_pwr18i);
    const auto multiplier_14p18 = static_cast<int32_t>(multiplier * two_pwr18f);

    if(is_broadcast_across_x)
    {
        // Prefix: a = non-broadcast, b = broadcast.

        const auto is_broadcast_input_1 = in1_win.x().step() == 0;
        auto       a_win                = is_broadcast_input_1 ? in0_win : in1_win;
        auto       b_win                = is_broadcast_input_1 ? in1_win : in0_win;
        const auto a_tensor             = is_broadcast_input_1 ? src0 : src1;
        const auto b_tensor             = is_broadcast_input_1 ? src1 : src0;

        const auto a_offset_16p0 = is_broadcast_input_1 ? in0_offset_16p0 : in1_offset_16p0;
        // NOTE(review): despite its `_16p0` name this holds the untruncated 32-bit offset
        // (in1_offset/in0_offset, not the int16_t copies). It is only used in 32-bit
        // arithmetic below, so behavior is unaffected — but the name is misleading.
        const auto b_offset_16p0 = is_broadcast_input_1 ? in1_offset : in0_offset;
#ifndef __aarch64__
        // Only the non-aarch64 scalar tail uses these float-path offsets.
        const auto a_offset = is_broadcast_input_1 ? in0_offset : in1_offset;
        const auto b_offset = is_broadcast_input_1 ? in1_offset : in0_offset;
#endif //__aarch64__
        const auto a_voffset_16p0 = wrapper::vdup_n(a_offset_16p0, wrapper::traits::vector_64_tag());

        // Clear the x dimension on the execution window as we process the whole row each iteration.
        a_win.set(Window::DimX, Window::Dimension(0, 1, 1));

        Iterator a_input_it(a_tensor, a_win);
        Iterator b_input_it(b_tensor, b_win);
        Iterator out_it(dst, win);

        execute_window_loop(
            win, [&](const Coordinates &)
        {
            const auto a_ptr   = reinterpret_cast<const ScalarType *>(a_input_it.ptr());
            const auto b_ptr   = reinterpret_cast<const ScalarType *>(b_input_it.ptr());
            const auto out_ptr = reinterpret_cast<ScalarType *>(out_it.ptr());

            // The broadcast operand contributes one un-offsetted scalar per row.
            const auto b_val             = *b_ptr;
            const auto b_offseted_32p0   = static_cast<int32_t>(b_val - b_offset_16p0);
            const auto b_voffseted_32p0  = wrapper::vdup_n(b_offseted_32p0, wrapper::traits::vector_128_tag());

            const auto vmultiplier_14p18 = wrapper::vdup_n(multiplier_14p18, wrapper::traits::vector_128_tag());
            const auto voffsetout_14p18  = wrapper::vdup_n(out_offset_14p18, wrapper::traits::vector_128_tag());

            int x = window_start_x;

            for(; x <= (window_end_x - window_step_x); x += window_step_x)
            {
                // Load the inputs.
                const auto a_vin_8p0 = wrapper::vloadq(a_ptr + x);

                // Widen the non-broadcast elements to signed 16-bit regardless of the input signedness.
                const auto a_vin_16p0_0 = wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(a_vin_8p0)));
                const auto a_vin_16p0_1 = wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(a_vin_8p0)));

                // Subtract the input offset, widening to 32-bit (4 lanes per quarter).
                const auto voffseted_32p0_00 = wrapper::vsubl(wrapper::vgetlow(a_vin_16p0_0), a_voffset_16p0);
                const auto voffseted_32p0_01 = wrapper::vsubl(wrapper::vgethigh(a_vin_16p0_0), a_voffset_16p0);
                const auto voffseted_32p0_10 = wrapper::vsubl(wrapper::vgetlow(a_vin_16p0_1), a_voffset_16p0);
                const auto voffseted_32p0_11 = wrapper::vsubl(wrapper::vgethigh(a_vin_16p0_1), a_voffset_16p0);

                // Raw integer product of the un-offsetted operands.
                const auto vinnermul_32p0_00 = wrapper::vmul(voffseted_32p0_00, b_voffseted_32p0);
                const auto vinnermul_32p0_01 = wrapper::vmul(voffseted_32p0_01, b_voffseted_32p0);
                const auto vinnermul_32p0_10 = wrapper::vmul(voffseted_32p0_10, b_voffseted_32p0);
                const auto vinnermul_32p0_11 = wrapper::vmul(voffseted_32p0_11, b_voffseted_32p0);

                // result_14p18 = out_offset_14p18 + product * multiplier_14p18
                const auto vout_14p18_00 = wrapper::vmla(voffsetout_14p18, vinnermul_32p0_00, vmultiplier_14p18);
                const auto vout_14p18_01 = wrapper::vmla(voffsetout_14p18, vinnermul_32p0_01, vmultiplier_14p18);
                const auto vout_14p18_10 = wrapper::vmla(voffsetout_14p18, vinnermul_32p0_10, vmultiplier_14p18);
                const auto vout_14p18_11 = wrapper::vmla(voffsetout_14p18, vinnermul_32p0_11, vmultiplier_14p18);

                // These shift rights are to revert the multiplication by twopwr18. Hard limit of a maximum shift by 8 requires multiple shift instructions to achieve this.
                const auto vout_15p1_00 = wrapper::vqrshrn_ex<8, ScalarType>(wrapper::vshrq_n<8>(vout_14p18_00));
                const auto vout_15p1_01 = wrapper::vqrshrn_ex<8, ScalarType>(wrapper::vshrq_n<8>(vout_14p18_01));
                const auto vout_15p1_10 = wrapper::vqrshrn_ex<8, ScalarType>(wrapper::vshrq_n<8>(vout_14p18_10));
                const auto vout_15p1_11 = wrapper::vqrshrn_ex<8, ScalarType>(wrapper::vshrq_n<8>(vout_14p18_11));

                const auto vout_15p1_0 = wrapper::vcombine(
                                             vout_15p1_00,
                                             vout_15p1_01);

                const auto vout_15p1_1 = wrapper::vcombine(
                                             vout_15p1_10,
                                             vout_15p1_11);
                // NOTE(review): this redeclaration shadows the identical `out_ptr` defined at
                // the top of the lambda; it is redundant but harmless.
                const auto out_ptr = reinterpret_cast<ScalarType *>(out_it.ptr());

                // Final saturating rounding shift by 2 removes the last fractional bits
                // and narrows back to 8-bit.
                const auto vout_8p0 = wrapper::vcombine(
                                          wrapper::vqrshrn<2>(vout_15p1_0),
                                          wrapper::vqrshrn<2>(vout_15p1_1));
                wrapper::vstore(out_ptr + x, vout_8p0);
            }

            //Process the left-over elements.
            for(; x < window_end_x; ++x)
            {
#ifdef __aarch64__
                // Scalar replica of the vector path: offset, multiply, scale by the 14.18
                // multiplier, then revert the 2^18 factor with the same 8+8+2 shift sequence.
                out_ptr[x] = wrapper::vqrshrn<2>(wrapper::vqrshrn_ex<8, ScalarType>(wrapper::vshrq_n<8>((multiplier_14p18 * (int32_t(a_ptr[x]) - a_offset_16p0) * (int32_t(
                                                                                                             b_val) - b_offset_16p0)) + out_offset_14p18)));
#else  //__aarch64__
                // On 32-bit targets the tail falls back to float math with clamping.
                out_ptr[x] = utility::clamp<int32_t, ScalarType>(support::cpp11::lround(multiplier * ((float(a_ptr[x]) - a_offset) * (float(b_val) - b_offset)) + float(out_offset)));
#endif //__aarch64__
            }
        },
        a_input_it, b_input_it, out_it);
    }
    else
    {
        // No X broadcast: both inputs are processed a full vector at a time.
        const auto voffset0_16p0    = wrapper::vdup_n(in0_offset_16p0, wrapper::traits::vector_64_tag());
        const auto voffset1_16p0    = wrapper::vdup_n(in1_offset_16p0, wrapper::traits::vector_64_tag());
        const auto voffsetout_14p18 = wrapper::vdup_n(out_offset_14p18, wrapper::traits::vector_128_tag());
        const auto vmultiplier_14p18 = wrapper::vdup_n(multiplier_14p18, wrapper::traits::vector_128_tag());

        // Clear the x dimension on the execution window as we process the whole row each iteration.
        in0_win.set(Window::DimX, Window::Dimension(0, 1, 1));
        in1_win.set(Window::DimX, Window::Dimension(0, 1, 1));

        Iterator in0_it(src0, in0_win);
        Iterator in1_it(src1, in1_win);
        Iterator out_it(dst, win);

        execute_window_loop(
            win, [&](const Coordinates &)
        {
            const auto in0_ptr = reinterpret_cast<const ScalarType *>(in0_it.ptr());
            const auto in1_ptr = reinterpret_cast<const ScalarType *>(in1_it.ptr());
            const auto out_ptr = reinterpret_cast<ScalarType *>(out_it.ptr());

            int x = window_start_x;

            for(; x <= (window_end_x - window_step_x); x += window_step_x)
            {
                // Load the inputs.
                const auto vin0_8p0 = wrapper::vloadq(in0_ptr + x);
                const auto vin1_8p0 = wrapper::vloadq(in1_ptr + x);

                // Widen the input elements to signed 16-bit regardless of the input signedness.
                const auto vin0_16p0_0 = wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(vin0_8p0)));
                const auto vin0_16p0_1 = wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(vin0_8p0)));
                const auto vin1_16p0_0 = wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(vin1_8p0)));
                const auto vin1_16p0_1 = wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(vin1_8p0)));

                // Subtract each input's offset, widening to 32-bit (4 lanes per quarter).
                const auto voffseted0_32p0_00 = wrapper::vsubl(wrapper::vgetlow(vin0_16p0_0), voffset0_16p0);
                const auto voffseted0_32p0_01 = wrapper::vsubl(wrapper::vgethigh(vin0_16p0_0), voffset0_16p0);
                const auto voffseted0_32p0_10 = wrapper::vsubl(wrapper::vgetlow(vin0_16p0_1), voffset0_16p0);
                const auto voffseted0_32p0_11 = wrapper::vsubl(wrapper::vgethigh(vin0_16p0_1), voffset0_16p0);

                const auto voffseted1_32p0_00 = wrapper::vsubl(wrapper::vgetlow(vin1_16p0_0), voffset1_16p0);
                const auto voffseted1_32p0_01 = wrapper::vsubl(wrapper::vgethigh(vin1_16p0_0), voffset1_16p0);
                const auto voffseted1_32p0_10 = wrapper::vsubl(wrapper::vgetlow(vin1_16p0_1), voffset1_16p0);
                const auto voffseted1_32p0_11 = wrapper::vsubl(wrapper::vgethigh(vin1_16p0_1), voffset1_16p0);

                // Raw integer product of the un-offsetted operands.
                const auto vinnermul_32p0_00 = wrapper::vmul(voffseted0_32p0_00, voffseted1_32p0_00);
                const auto vinnermul_32p0_01 = wrapper::vmul(voffseted0_32p0_01, voffseted1_32p0_01);
                const auto vinnermul_32p0_10 = wrapper::vmul(voffseted0_32p0_10, voffseted1_32p0_10);
                const auto vinnermul_32p0_11 = wrapper::vmul(voffseted0_32p0_11, voffseted1_32p0_11);

                // result_14p18 = out_offset_14p18 + product * multiplier_14p18
                const auto vout_14p18_00 = wrapper::vmla(voffsetout_14p18, vinnermul_32p0_00, vmultiplier_14p18);
                const auto vout_14p18_01 = wrapper::vmla(voffsetout_14p18, vinnermul_32p0_01, vmultiplier_14p18);
                const auto vout_14p18_10 = wrapper::vmla(voffsetout_14p18, vinnermul_32p0_10, vmultiplier_14p18);
                const auto vout_14p18_11 = wrapper::vmla(voffsetout_14p18, vinnermul_32p0_11, vmultiplier_14p18);

                // These shift rights are to revert the multiplication by twopwr18. Hard limit of a maximum shift by 8 requires multiple shift instructions to achieve this.
                const auto vout_14p2_00 = wrapper::vqrshrn_ex<8, ScalarType>(wrapper::vshrq_n<8>(vout_14p18_00));
                const auto vout_14p2_01 = wrapper::vqrshrn_ex<8, ScalarType>(wrapper::vshrq_n<8>(vout_14p18_01));
                const auto vout_14p2_10 = wrapper::vqrshrn_ex<8, ScalarType>(wrapper::vshrq_n<8>(vout_14p18_10));
                const auto vout_14p2_11 = wrapper::vqrshrn_ex<8, ScalarType>(wrapper::vshrq_n<8>(vout_14p18_11));

                const auto vout_14p2_0 = wrapper::vcombine(
                                             vout_14p2_00,
                                             vout_14p2_01);

                const auto vout_14p2_1 = wrapper::vcombine(
                                             vout_14p2_10,
                                             vout_14p2_11);

                // Final saturating rounding shift by 2 removes the last fractional bits
                // and narrows back to 8-bit.
                const auto vout_8p0 = wrapper::vcombine(
                                          wrapper::vqrshrn<2>(vout_14p2_0),
                                          wrapper::vqrshrn<2>(vout_14p2_1));
                wrapper::vstore(out_ptr + x, vout_8p0);
            }

            //Process the left-over elements.
            for(; x < window_end_x; ++x)
            {
#ifdef __aarch64__
                // Scalar replica of the vector path: offset, multiply, scale by the 14.18
                // multiplier, then revert the 2^18 factor with the same 8+8+2 shift sequence.
                out_ptr[x] = wrapper::vqrshrn<2>(wrapper::vqrshrn_ex<8, ScalarType>(wrapper::vshrq_n<8>((multiplier_14p18 * (int32_t(in0_ptr[x]) - in0_offset_16p0) * (int32_t(
                                                                                                             in1_ptr[x]) - in1_offset_16p0)) + out_offset_14p18)));
#else  //__aarch64__
                // On 32-bit targets the tail falls back to float math with clamping.
                out_ptr[x] = utility::clamp<int32_t, ScalarType>(support::cpp11::lround(multiplier * ((float(in0_ptr[x]) - in0_offset) * (float(in1_ptr[x]) - in1_offset)) + float(out_offset)));
#endif //__aarch64__
            }
        },
        in0_it, in1_it, out_it);
    }
}
550
Sheri Zhang1e3ab422021-03-16 17:35:08 +0000551void mul_saturate_QSYMM16_QSYMM16_QSYMM16(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, float scale)
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +0100552{
Sheri Zhang1e3ab422021-03-16 17:35:08 +0000553 const UniformQuantizationInfo input1_qua_info = src1->info()->quantization_info().uniform();
554 const UniformQuantizationInfo input2_qua_info = src2->info()->quantization_info().uniform();
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +0100555 const UniformQuantizationInfo output_qua_info = out->info()->quantization_info().uniform();
556
557 // Create input windows
558 Window win = window;
Sheri Zhang1e3ab422021-03-16 17:35:08 +0000559 Window input1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
560 Window input2_win = window.broadcast_if_dimension_le_one(src2->info()->tensor_shape());
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +0100561
562 // Clear X Dimension on execution window as we handle manually
563 win.set(Window::DimX, Window::Dimension(0, 1, 1));
564 input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
565 input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
566
Sheri Zhang1e3ab422021-03-16 17:35:08 +0000567 Iterator input1(src1, input1_win);
568 Iterator input2(src2, input2_win);
569 Iterator dst(out, win);
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +0100570
571 const int window_step_x = 16;
572 const auto window_start_x = static_cast<int>(window.x().start());
573 const auto window_end_x = static_cast<int>(window.x().end());
574
575 const UniformQuantizationInfo tmp_qua_info = { output_qua_info.scale / scale, output_qua_info.offset };
576
Omar Al Khatib605a9282022-11-01 17:01:24 +0000577 execute_window_loop(
578 win, [&](const Coordinates &)
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +0100579 {
580 const auto input1_ptr = reinterpret_cast<const qsymm16_t *>(input1.ptr());
581 const auto input2_ptr = reinterpret_cast<const qsymm16_t *>(input2.ptr());
Sheri Zhang1e3ab422021-03-16 17:35:08 +0000582 const auto output_ptr = reinterpret_cast<qsymm16_t *>(dst.ptr());
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +0100583
584 // Compute window_step_x elements per iteration
585 int x = window_start_x;
586 for(; x <= (window_end_x - window_step_x); x += window_step_x)
587 {
588 const qsymm16x8x2_t input1_q =
589 {
590 {
591 vld1q_s16(input1_ptr + x),
592 vld1q_s16(input1_ptr + x + 8),
593 }
594 };
595 const qsymm16x8x2_t input2_q =
596 {
597 {
598 vld1q_s16(input2_ptr + x),
599 vld1q_s16(input2_ptr + x + 8),
600 }
601 };
602
603 // Dequantize inputs
604 const float32x4x4_t in1_f32x4x4 = vdequantize(input1_q, input1_qua_info);
605 const float32x4x4_t in2_f32x4x4 = vdequantize(input2_q, input2_qua_info);
606
607 const float32x4x4_t out_f32x4x4 =
608 {
609 vmulq_f32(in1_f32x4x4.val[0], in2_f32x4x4.val[0]),
610 vmulq_f32(in1_f32x4x4.val[1], in2_f32x4x4.val[1]),
611 vmulq_f32(in1_f32x4x4.val[2], in2_f32x4x4.val[2]),
612 vmulq_f32(in1_f32x4x4.val[3], in2_f32x4x4.val[3]),
613 };
614
615 const qsymm16x8x2_t result = vquantize_qsymm16(out_f32x4x4, tmp_qua_info);
616 vst1q_s16(output_ptr + x, result.val[0]);
617 vst1q_s16(output_ptr + x + 8, result.val[1]);
618 }
619
620 // Compute left-over elements
621 for(; x < window_end_x; ++x)
622 {
623 // Dequantize inputs
624 float tmp_in1 = static_cast<float>(*(input1_ptr + x)) * input1_qua_info.scale;
625 float tmp_in2 = static_cast<float>(*(input2_ptr + x)) * input2_qua_info.scale;
626 float tmp_f = tmp_in1 * tmp_in2;
627
Sheri Zhang1e3ab422021-03-16 17:35:08 +0000628 // Quantize dst, lrintf() has same rounding mode as vcombine_s16
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +0100629 int32_t tmp = lrintf(tmp_f / tmp_qua_info.scale);
630 qsymm16_t tmp_qua = static_cast<qsymm16_t>(tmp > SHRT_MAX) ? SHRT_MAX : ((tmp < SHRT_MIN) ? SHRT_MIN : tmp);
631 *(output_ptr + x) = tmp_qua;
632 }
633 },
Sheri Zhang1e3ab422021-03-16 17:35:08 +0000634 input1, input2, dst);
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +0100635}
636
Sheri Zhang1e3ab422021-03-16 17:35:08 +0000637void mul_QSYMM16_QSYMM16_S32(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, int scale)
Michele Di Giorgio9428a182020-03-30 14:10:20 +0100638{
639 ARM_COMPUTE_UNUSED(scale);
Michele Di Giorgio9428a182020-03-30 14:10:20 +0100640
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +0100641 // Create input windows
642 Window win = window;
Sheri Zhang1e3ab422021-03-16 17:35:08 +0000643 Window input1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
644 Window input2_win = window.broadcast_if_dimension_le_one(src2->info()->tensor_shape());
Michele Di Giorgio9428a182020-03-30 14:10:20 +0100645
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +0100646 // Clear X Dimension on execution window as we handle manually
647 win.set(Window::DimX, Window::Dimension(0, 1, 1));
648 input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
649 input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
Michele Di Giorgio9428a182020-03-30 14:10:20 +0100650
Sheri Zhang1e3ab422021-03-16 17:35:08 +0000651 Iterator input1(src1, input1_win);
652 Iterator input2(src2, input2_win);
653 Iterator dst(out, win);
Michele Di Giorgio9428a182020-03-30 14:10:20 +0100654
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +0100655 const int window_step_x = 16;
656 const auto window_start_x = static_cast<int>(window.x().start());
657 const auto window_end_x = static_cast<int>(window.x().end());
658
Omar Al Khatib605a9282022-11-01 17:01:24 +0000659 execute_window_loop(
660 win, [&](const Coordinates &)
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +0100661 {
662 const auto input1_ptr = reinterpret_cast<const qsymm16_t *>(input1.ptr());
663 const auto input2_ptr = reinterpret_cast<const qsymm16_t *>(input2.ptr());
Sheri Zhang1e3ab422021-03-16 17:35:08 +0000664 const auto output_ptr = reinterpret_cast<int32_t *>(dst.ptr());
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +0100665
666 // Compute window_step_x elements per iteration
667 int x = window_start_x;
668 for(; x <= (window_end_x - window_step_x); x += window_step_x)
669 {
670 const qsymm16x8x2_t input1_q =
671 {
672 {
673 vld1q_s16(input1_ptr + x),
674 vld1q_s16(input1_ptr + x + 8),
675 }
676 };
677 const qsymm16x8x2_t input2_q =
678 {
679 {
680 vld1q_s16(input2_ptr + x),
681 vld1q_s16(input2_ptr + x + 8),
682 }
683 };
684
685 const int32x4x4_t in1_s32 =
686 {
687 {
688 vmovl_s16(vget_low_s16(input1_q.val[0])),
689 vmovl_s16(vget_high_s16(input1_q.val[0])),
690 vmovl_s16(vget_low_s16(input1_q.val[1])),
691 vmovl_s16(vget_high_s16(input1_q.val[1])),
692 }
693 };
694 const int32x4x4_t in2_s32 =
695 {
696 {
697 vmovl_s16(vget_low_s16(input2_q.val[0])),
698 vmovl_s16(vget_high_s16(input2_q.val[0])),
699 vmovl_s16(vget_low_s16(input2_q.val[1])),
700 vmovl_s16(vget_high_s16(input2_q.val[1])),
701 }
702 };
703
704 const int32x4x4_t result =
705 {
706 {
707 vmulq_s32(in1_s32.val[0], in2_s32.val[0]),
708 vmulq_s32(in1_s32.val[1], in2_s32.val[1]),
709 vmulq_s32(in1_s32.val[2], in2_s32.val[2]),
710 vmulq_s32(in1_s32.val[3], in2_s32.val[3]),
711 }
712 };
713
714 vst1q_s32(output_ptr + x, result.val[0]);
715 vst1q_s32(output_ptr + x + 4, result.val[1]);
716 vst1q_s32(output_ptr + x + 8, result.val[2]);
717 vst1q_s32(output_ptr + x + 12, result.val[3]);
718 }
719
720 // Compute left-over elements
721 for(; x < window_end_x; ++x)
722 {
723 int32_t tmp = static_cast<int32_t>(*(input1_ptr + x)) * static_cast<int32_t>(*(input2_ptr + x));
724 *(output_ptr + x) = tmp;
725 }
726 },
Sheri Zhang1e3ab422021-03-16 17:35:08 +0000727 input1, input2, dst);
Manuel Bottini7bb56c62019-06-26 15:17:09 +0100728}
729
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100730template <bool is_scale255, bool is_sat>
Sheri Zhang1e3ab422021-03-16 17:35:08 +0000731void mul_U8_U8_U8(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, int n)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100732{
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +0100733 // Create input windows
734 Window win = window;
Sheri Zhang1e3ab422021-03-16 17:35:08 +0000735 Window input1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
736 Window input2_win = window.broadcast_if_dimension_le_one(src2->info()->tensor_shape());
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100737
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +0100738 // Clear X Dimension on execution window as we handle manually
739 win.set(Window::DimX, Window::Dimension(0, 1, 1));
740 input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
741 input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100742
Sheri Zhang1e3ab422021-03-16 17:35:08 +0000743 Iterator input1(src1, input1_win);
744 Iterator input2(src2, input2_win);
745 Iterator dst(out, win);
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100746
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +0100747 const int window_step_x = 16 / sizeof(uint8_t);
748 const auto window_start_x = static_cast<int>(window.x().start());
749 const auto window_end_x = static_cast<int>(window.x().end());
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100750
Omar Al Khatib605a9282022-11-01 17:01:24 +0000751 execute_window_loop(
752 win, [&](const Coordinates &)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100753 {
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +0100754 const auto input1_ptr = reinterpret_cast<const uint8_t *>(input1.ptr());
755 const auto input2_ptr = reinterpret_cast<const uint8_t *>(input2.ptr());
Sheri Zhang1e3ab422021-03-16 17:35:08 +0000756 const auto output_ptr = reinterpret_cast<uint8_t *>(dst.ptr());
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100757
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +0100758 // Compute window_step_x elements per iteration
759 int x = window_start_x;
760 for(; x <= (window_end_x - window_step_x); x += window_step_x)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100761 {
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +0100762 const uint8x16_t ta1 = wrapper::vloadq(input1_ptr + x);
763 const uint8x16_t ta2 = wrapper::vloadq(input2_ptr + x);
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100764
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +0100765 uint16x8_t tmp1_high = vmovl_u8(vget_high_u8(ta1));
766 const uint16x8_t tmp2_high = vmovl_u8(vget_high_u8(ta2));
767 uint16x8_t tmp1_low = vmovl_u8(vget_low_u8(ta1));
768 const uint16x8_t tmp2_low = vmovl_u8(vget_low_u8(ta2));
769
770 tmp1_high = vmulq_u16(tmp1_high, tmp2_high);
771 tmp1_low = vmulq_u16(tmp1_low, tmp2_low);
772
773 if(is_scale255)
774 {
775 tmp1_high = scale255_U16_U16(tmp1_high);
776 tmp1_low = scale255_U16_U16(tmp1_low);
777 }
778 else
779 {
780 const int16x8_t vn = vdupq_n_s16(-n);
781
782 if(is_sat)
783 {
784 tmp1_high = vqshlq_u16(tmp1_high, vn);
785 tmp1_low = vqshlq_u16(tmp1_low, vn);
786 }
787 else
788 {
789 tmp1_high = vshlq_u16(tmp1_high, vn);
790 tmp1_low = vshlq_u16(tmp1_low, vn);
791 }
792 }
793 if(is_sat)
794 {
Viet-Hoa Do0d05b662022-09-09 15:39:05 +0100795 vst1q_u8(output_ptr + x, vcombine_u8(vqmovn_u16(tmp1_low), vqmovn_u16(tmp1_high)));
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +0100796 }
797 else
798 {
Viet-Hoa Do0d05b662022-09-09 15:39:05 +0100799 vst1q_u8(output_ptr + x, vcombine_u8(vmovn_u16(tmp1_low), vmovn_u16(tmp1_high)));
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +0100800 }
801 }
802
803 // Compute left-over elements
804 for(; x < window_end_x; ++x)
805 {
806 uint16_t tmp = static_cast<uint16_t>(*(input1_ptr + x)) * static_cast<uint16_t>(*(input2_ptr + x));
807
808 if(is_scale255)
809 {
810 float tmp_f = static_cast<float>(tmp) * scale255_constant;
811 tmp = static_cast<uint16_t>(tmp_f + 0.5f);
812 }
813 else
814 {
815 tmp >>= n;
816 }
817 if(is_sat && tmp > 255)
818 {
819 tmp = 255;
820 }
821 *(output_ptr + x) = static_cast<uint8_t>(tmp);
822 }
823 },
Sheri Zhang1e3ab422021-03-16 17:35:08 +0000824 input1, input2, dst);
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100825}
826
827template <bool is_scale255, bool is_sat>
Sheri Zhang1e3ab422021-03-16 17:35:08 +0000828inline int16x8_t mul_S16_S16_S16_n_loop(const int16x8_t &src1, const int16x8_t &src2, int n)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100829{
Sheri Zhang1e3ab422021-03-16 17:35:08 +0000830 int32x4_t tmp1_high = vmovl_s16(vget_high_s16(src1));
831 const int32x4_t tmp2_high = vmovl_s16(vget_high_s16(src2));
832 int32x4_t tmp1_low = vmovl_s16(vget_low_s16(src1));
833 const int32x4_t tmp2_low = vmovl_s16(vget_low_s16(src2));
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100834
835 tmp1_high = vmulq_s32(tmp1_high, tmp2_high);
836 tmp1_low = vmulq_s32(tmp1_low, tmp2_low);
837
838 if(is_scale255)
839 {
840 tmp1_high = scale255_S32_S32(tmp1_high);
841 tmp1_low = scale255_S32_S32(tmp1_low);
842 }
843 else
844 {
845 // Right shift amount
846 const int32x4_t vn = vdupq_n_s32(-n);
847 // Left shift amount
848 const int32x4_t vnl = vdupq_n_s32(n);
849 // Calculate conversion bit
850 const uint32x4_t tmp1_high_u = vreinterpretq_u32_s32(tmp1_high);
851 const uint32x4_t tmp1_low_u = vreinterpretq_u32_s32(tmp1_low);
852 const uint32x4_t sign_high = vshrq_n_u32(tmp1_high_u, 31);
853 const uint32x4_t sign_low = vshrq_n_u32(tmp1_low_u, 31);
854 const int32x4_t sign_high_s = vreinterpretq_s32_u32(sign_high);
855 const int32x4_t sign_low_s = vreinterpretq_s32_u32(sign_low);
856 const int32x4_t convert_high = vsubq_s32(vshlq_s32(sign_high_s, vnl), sign_high_s);
857 const int32x4_t convert_low = vsubq_s32(vshlq_s32(sign_low_s, vnl), sign_low_s);
858 if(is_sat)
859 {
860 tmp1_high = vqshlq_s32(vaddq_s32(tmp1_high, convert_high), vn);
861 tmp1_low = vqshlq_s32(vaddq_s32(tmp1_low, convert_low), vn);
862 }
863 else
864 {
865 tmp1_high = vshlq_s32(vaddq_s32(tmp1_high, convert_high), vn);
866 tmp1_low = vshlq_s32(vaddq_s32(tmp1_low, convert_low), vn);
867 }
868 }
869
870 if(is_sat)
871 {
872 return vcombine_s16(vqmovn_s32(tmp1_low), vqmovn_s32(tmp1_high));
873 }
874 else
875 {
876 return vcombine_s16(vmovn_s32(tmp1_low), vmovn_s32(tmp1_high));
877 }
878}
879
880template <bool is_scale255, bool is_sat>
Sheri Zhang1e3ab422021-03-16 17:35:08 +0000881inline int16x8x2_t mul_S16_S16_S16_n_k(const int16x8x2_t &src1, const int16x8x2_t &src2, int n)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100882{
883 const int16x8x2_t result =
884 {
885 {
886 // First 8 elements
Sheri Zhang1e3ab422021-03-16 17:35:08 +0000887 mul_S16_S16_S16_n_loop<is_scale255, is_sat>(src1.val[0], src2.val[0], n),
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100888 // Second 8 elements
Sheri Zhang1e3ab422021-03-16 17:35:08 +0000889 mul_S16_S16_S16_n_loop<is_scale255, is_sat>(src1.val[1], src2.val[1], n)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100890 }
891 };
892
893 return result;
894}
895
896template <bool is_scale255, bool is_sat>
Sheri Zhang1e3ab422021-03-16 17:35:08 +0000897void mul_S16_S16_S16(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, int n)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100898{
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +0100899 // Create input windows
900 Window win = window;
Sheri Zhang1e3ab422021-03-16 17:35:08 +0000901 Window input1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
902 Window input2_win = window.broadcast_if_dimension_le_one(src2->info()->tensor_shape());
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100903
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +0100904 // Clear X Dimension on execution window as we handle manually
905 win.set(Window::DimX, Window::Dimension(0, 1, 1));
906 input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
907 input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100908
Sheri Zhang1e3ab422021-03-16 17:35:08 +0000909 Iterator input1(src1, input1_win);
910 Iterator input2(src2, input2_win);
911 Iterator dst(out, win);
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +0100912
913 const int window_step_x = 16;
914 const auto window_start_x = static_cast<int>(window.x().start());
915 const auto window_end_x = static_cast<int>(window.x().end());
916
Omar Al Khatib605a9282022-11-01 17:01:24 +0000917 execute_window_loop(
918 win, [&](const Coordinates &)
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +0100919 {
920 const auto input1_ptr = reinterpret_cast<const int16_t *>(input1.ptr());
921 const auto input2_ptr = reinterpret_cast<const int16_t *>(input2.ptr());
Sheri Zhang1e3ab422021-03-16 17:35:08 +0000922 const auto output_ptr = reinterpret_cast<int16_t *>(dst.ptr());
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +0100923
924 // Compute window_step_x elements per iteration
925 int x = window_start_x;
926 for(; x <= (window_end_x - window_step_x); x += window_step_x)
927 {
928 const int16x8x2_t ta1 =
929 {
930 {
931 vld1q_s16(input1_ptr + x),
932 vld1q_s16(input1_ptr + x + 8),
933 }
934 };
935 const int16x8x2_t ta2 =
936 {
937 {
938 vld1q_s16(input2_ptr + x),
939 vld1q_s16(input2_ptr + x + 8),
940 }
941 };
942 const int16x8x2_t result = mul_S16_S16_S16_n_k<is_scale255, is_sat>(ta1, ta2, n);
943
944 vst1q_s16(output_ptr + x, result.val[0]);
945 vst1q_s16(output_ptr + x + 8, result.val[1]);
946 }
947
948 // Compute left-over elements
949 for(; x < window_end_x; ++x)
950 {
951 int32_t tmp = static_cast<int32_t>(*(input1_ptr + x)) * static_cast<int32_t>(*(input2_ptr + x));
952
953 if(is_scale255)
954 {
955 float tmp_f = static_cast<float>(tmp) * scale255_constant;
956
957 tmp = static_cast<int32_t>(tmp_f + 0.5f);
958 }
959 else
960 {
961 if(tmp >= 0)
962 {
963 tmp >>= n;
964 }
965 else
966 {
967 uint32_t mask = (1u << n) - 1;
968 tmp = (tmp + static_cast<int32_t>(mask)) >> n;
969 }
970 }
971 if(is_sat)
972 {
973 tmp = (tmp > SHRT_MAX) ? SHRT_MAX : ((tmp < SHRT_MIN) ? SHRT_MIN : tmp);
974 }
975 *(output_ptr + x) = static_cast<int16_t>(tmp);
976 }
977 },
Sheri Zhang1e3ab422021-03-16 17:35:08 +0000978 input1, input2, dst);
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100979}
980
Omar Al Khatib605a9282022-11-01 17:01:24 +0000981template <bool is_sat>
Sheri Zhang1e3ab422021-03-16 17:35:08 +0000982inline int32x4_t mul_S32_S32_S32_n_loop(const int32x4_t &src1, const int32x4_t &src2, int n)
SiCong Libb88f892020-08-28 11:18:47 +0100983{
Sheri Zhang1e3ab422021-03-16 17:35:08 +0000984 const int32x2_t input1_1 = vget_low_s32(src1);
985 const int32x2_t input2_1 = vget_low_s32(src2);
986 const int32x2_t input1_2 = vget_high_s32(src1);
987 const int32x2_t input2_2 = vget_high_s32(src2);
SiCong Libb88f892020-08-28 11:18:47 +0100988
989 int64x2_t tmp_1 = vmull_s32(input1_1, input2_1);
990 int64x2_t tmp_2 = vmull_s32(input1_2, input2_2);
991
992 // Apply scaling, conversion and rounding (round to zero)
993 // Right shift amount
994 const int64x2_t vn = vdupq_n_s64(-n);
995 // Left shift amount
996 const int64x2_t vnl = vdupq_n_s64(n);
997 // Calculate conversion bit
998 const uint64x2_t tmp_1_u = vreinterpretq_u64_s64(tmp_1);
999 const uint64x2_t sign_1 = vshrq_n_u64(tmp_1_u, 63);
1000 const int64x2_t sign_1_s = vreinterpretq_s64_u64(sign_1);
1001 const int64x2_t convert_1 = vsubq_s64(vshlq_s64(sign_1_s, vnl), sign_1_s);
1002
1003 const uint64x2_t tmp_2_u = vreinterpretq_u64_s64(tmp_2);
1004 const uint64x2_t sign_2 = vshrq_n_u64(tmp_2_u, 63);
1005 const int64x2_t sign_2_s = vreinterpretq_s64_u64(sign_2);
1006 const int64x2_t convert_2 = vsubq_s64(vshlq_s64(sign_2_s, vnl), sign_2_s);
1007 if(is_sat)
1008 {
1009 tmp_1 = vqshlq_s64(vaddq_s64(tmp_1, convert_1), vn);
1010 tmp_2 = vqshlq_s64(vaddq_s64(tmp_2, convert_2), vn);
1011 return vcombine_s32(vqmovn_s64(tmp_1), vqmovn_s64(tmp_2));
1012 }
1013 else
1014 {
1015 tmp_1 = vshlq_s64(vaddq_s64(tmp_1, convert_1), vn);
1016 tmp_2 = vshlq_s64(vaddq_s64(tmp_2, convert_2), vn);
1017 return vcombine_s32(vmovn_s64(tmp_1), vmovn_s64(tmp_2));
1018 }
1019}
1020
Omar Al Khatib605a9282022-11-01 17:01:24 +00001021template <bool is_sat>
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001022inline int32x4x2_t mul_S32_S32_S32_n_k(const int32x4x2_t &src1, const int32x4x2_t &src2, int n)
SiCong Libb88f892020-08-28 11:18:47 +01001023{
1024 const int32x4x2_t result =
1025 {
1026 {
1027 // First 4 elements
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001028 mul_S32_S32_S32_n_loop<is_sat>(src1.val[0], src2.val[0], n),
SiCong Libb88f892020-08-28 11:18:47 +01001029 // Second 4 elements
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001030 mul_S32_S32_S32_n_loop<is_sat>(src1.val[1], src2.val[1], n)
SiCong Libb88f892020-08-28 11:18:47 +01001031 }
1032 };
1033
1034 return result;
1035}
1036
1037template <bool is_sat>
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001038void mul_S32_S32_S32(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, int n)
SiCong Libb88f892020-08-28 11:18:47 +01001039{
1040 // Create input windows
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001041 Window input1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
1042 Window input2_win = window.broadcast_if_dimension_le_one(src2->info()->tensor_shape());
SiCong Libb88f892020-08-28 11:18:47 +01001043
1044 // Clear X Dimension on execution window as we handle manually
SiCong Lid6d1b362020-09-24 17:34:23 +01001045 Window win = window;
SiCong Libb88f892020-08-28 11:18:47 +01001046 win.set(Window::DimX, Window::Dimension(0, 1, 1));
SiCong Libb88f892020-08-28 11:18:47 +01001047
SiCong Lid6d1b362020-09-24 17:34:23 +01001048 const int window_step_x = 8;
1049 const auto window_start_x = static_cast<int>(window.x().start());
1050 const auto window_end_x = static_cast<int>(window.x().end());
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001051 const bool is_broadcast_across_x = src1->info()->tensor_shape().x() != src2->info()->tensor_shape().x();
SiCong Libb88f892020-08-28 11:18:47 +01001052
SiCong Lid6d1b362020-09-24 17:34:23 +01001053 if(is_broadcast_across_x)
SiCong Libb88f892020-08-28 11:18:47 +01001054 {
SiCong Lid6d1b362020-09-24 17:34:23 +01001055 const bool is_broadcast_input_2 = input2_win.x().step() == 0;
1056 Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win;
1057 Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win;
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001058 const ITensor *broadcast_tensor = is_broadcast_input_2 ? src2 : src1;
1059 const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? src2 : src1;
SiCong Libb88f892020-08-28 11:18:47 +01001060
SiCong Lid6d1b362020-09-24 17:34:23 +01001061 // Clear X Dimension on execution window as we handle manually
1062 non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));
1063
1064 Iterator broadcast_input(broadcast_tensor, broadcast_win);
1065 Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001066 Iterator dst(out, win);
SiCong Lid6d1b362020-09-24 17:34:23 +01001067
Omar Al Khatib605a9282022-11-01 17:01:24 +00001068 execute_window_loop(
1069 win, [&](const Coordinates &)
SiCong Libb88f892020-08-28 11:18:47 +01001070 {
SiCong Lid6d1b362020-09-24 17:34:23 +01001071 const auto non_broadcast_input_ptr = reinterpret_cast<const int32_t *>(non_broadcast_input.ptr());
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001072 const auto output_ptr = reinterpret_cast<int32_t *>(dst.ptr());
SiCong Libb88f892020-08-28 11:18:47 +01001073
SiCong Lid6d1b362020-09-24 17:34:23 +01001074 const int32_t broadcast_value = *reinterpret_cast<const int32_t *>(broadcast_input.ptr());
1075 const auto broadcast_value_vec = vdupq_n_s32(broadcast_value);
SiCong Libb88f892020-08-28 11:18:47 +01001076
SiCong Lid6d1b362020-09-24 17:34:23 +01001077 // Compute window_step_x elements per iteration
1078 int x = window_start_x;
1079 for(; x <= (window_end_x - window_step_x); x += window_step_x)
1080 {
1081 const int32x4x2_t broadcast_v =
1082 {
1083 {
1084 broadcast_value_vec,
1085 broadcast_value_vec,
1086 }
1087 };
1088 const int32x4x2_t non_broadcast_v =
1089 {
1090 {
1091 vld1q_s32(non_broadcast_input_ptr + x),
1092 vld1q_s32(non_broadcast_input_ptr + x + 4),
1093 }
1094 };
1095 const int32x4x2_t result = mul_S32_S32_S32_n_k<is_sat>(broadcast_v, non_broadcast_v, n);
1096
1097 vst1q_s32(output_ptr + x, result.val[0]);
1098 vst1q_s32(output_ptr + x + 4, result.val[1]);
1099 }
1100
1101 // Compute left-over elements
1102 for(; x < window_end_x; ++x)
1103 {
1104 int64_t tmp = static_cast<int64_t>(broadcast_value) * static_cast<int64_t>(*(non_broadcast_input_ptr + x));
1105
1106 if(tmp >= 0)
1107 {
1108 tmp >>= n;
1109 }
1110 else
1111 {
Michalis Spyrou7f8caf72021-05-13 13:35:30 +01001112 uint64_t mask = ((uint64_t)1u << n) - 1;
SiCong Lid6d1b362020-09-24 17:34:23 +01001113 tmp = (tmp + static_cast<int64_t>(mask)) >> n;
1114 }
1115 if(is_sat)
1116 {
1117 tmp = utility::clamp<int64_t, int32_t>(tmp);
1118 }
1119 *(output_ptr + x) = static_cast<int32_t>(tmp);
1120 }
1121 },
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001122 broadcast_input, non_broadcast_input, dst);
SiCong Lid6d1b362020-09-24 17:34:23 +01001123 }
1124 else
1125 {
1126 // Clear X Dimension on execution window as we handle manually
1127 input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
1128 input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
1129
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001130 Iterator input1(src1, input1_win);
1131 Iterator input2(src2, input2_win);
1132 Iterator dst(out, win);
SiCong Lid6d1b362020-09-24 17:34:23 +01001133
Omar Al Khatib605a9282022-11-01 17:01:24 +00001134 execute_window_loop(
1135 win, [&](const Coordinates &)
SiCong Libb88f892020-08-28 11:18:47 +01001136 {
SiCong Lid6d1b362020-09-24 17:34:23 +01001137 const auto input1_ptr = reinterpret_cast<const int32_t *>(input1.ptr());
1138 const auto input2_ptr = reinterpret_cast<const int32_t *>(input2.ptr());
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001139 const auto output_ptr = reinterpret_cast<int32_t *>(dst.ptr());
SiCong Libb88f892020-08-28 11:18:47 +01001140
SiCong Lid6d1b362020-09-24 17:34:23 +01001141 // Compute window_step_x elements per iteration
1142 int x = window_start_x;
1143 for(; x <= (window_end_x - window_step_x); x += window_step_x)
SiCong Libb88f892020-08-28 11:18:47 +01001144 {
SiCong Lid6d1b362020-09-24 17:34:23 +01001145 const int32x4x2_t ta1 =
1146 {
1147 {
1148 vld1q_s32(input1_ptr + x),
1149 vld1q_s32(input1_ptr + x + 4),
1150 }
1151 };
1152 const int32x4x2_t ta2 =
1153 {
1154 {
1155 vld1q_s32(input2_ptr + x),
1156 vld1q_s32(input2_ptr + x + 4),
1157 }
1158 };
1159 const int32x4x2_t result = mul_S32_S32_S32_n_k<is_sat>(ta1, ta2, n);
1160
1161 vst1q_s32(output_ptr + x, result.val[0]);
1162 vst1q_s32(output_ptr + x + 4, result.val[1]);
SiCong Libb88f892020-08-28 11:18:47 +01001163 }
SiCong Lid6d1b362020-09-24 17:34:23 +01001164
1165 // Compute left-over elements
1166 for(; x < window_end_x; ++x)
SiCong Libb88f892020-08-28 11:18:47 +01001167 {
SiCong Lid6d1b362020-09-24 17:34:23 +01001168 int64_t tmp = static_cast<int64_t>(*(input1_ptr + x)) * static_cast<int64_t>(*(input2_ptr + x));
1169
1170 if(tmp >= 0)
1171 {
1172 tmp >>= n;
1173 }
1174 else
1175 {
Michalis Spyrou7f8caf72021-05-13 13:35:30 +01001176 uint64_t mask = ((uint64_t)1u << n) - 1;
SiCong Lid6d1b362020-09-24 17:34:23 +01001177 tmp = (tmp + static_cast<int64_t>(mask)) >> n;
1178 }
1179 if(is_sat)
1180 {
1181 tmp = utility::clamp<int64_t, int32_t>(tmp);
1182 }
1183 *(output_ptr + x) = static_cast<int32_t>(tmp);
SiCong Libb88f892020-08-28 11:18:47 +01001184 }
SiCong Lid6d1b362020-09-24 17:34:23 +01001185 },
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001186 input1, input2, dst);
SiCong Lid6d1b362020-09-24 17:34:23 +01001187 }
SiCong Libb88f892020-08-28 11:18:47 +01001188}
1189
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001190void mul_F32_F32_F32(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, float scale)
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001191{
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +01001192 // Create input windows
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001193 Window input1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
1194 Window input2_win = window.broadcast_if_dimension_le_one(src2->info()->tensor_shape());
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001195
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +01001196 // Clear X Dimension on execution window as we handle manually
1197 Window win = window;
1198 win.set(Window::DimX, Window::Dimension(0, 1, 1));
1199
1200 constexpr int window_step_x = 16 / sizeof(float);
1201 const auto window_start_x = static_cast<int>(window.x().start());
1202 const auto window_end_x = static_cast<int>(window.x().end());
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001203 const bool is_broadcast_across_x = src1->info()->tensor_shape().x() != src2->info()->tensor_shape().x();
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +01001204
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +01001205 using ExactTagType = typename wrapper::traits::neon_vector<float, window_step_x>::tag_type;
1206
1207 if(is_broadcast_across_x)
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001208 {
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +01001209 const bool is_broadcast_input_2 = input2_win.x().step() == 0;
1210 Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win;
1211 Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win;
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001212 const ITensor *broadcast_tensor = is_broadcast_input_2 ? src2 : src1;
1213 const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? src2 : src1;
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +01001214
1215 // Clear X Dimension on execution window as we handle manually
1216 non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));
1217
1218 Iterator broadcast_input(broadcast_tensor, broadcast_win);
1219 Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001220 Iterator dst(out, win);
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +01001221
Omar Al Khatib605a9282022-11-01 17:01:24 +00001222 execute_window_loop(
1223 win, [&](const Coordinates &)
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001224 {
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +01001225 const auto non_broadcast_input_ptr = reinterpret_cast<const float *>(non_broadcast_input.ptr());
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001226 const auto output_ptr = reinterpret_cast<float *>(dst.ptr());
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +01001227
1228 const float broadcast_value = *reinterpret_cast<const float *>(broadcast_input.ptr());
1229 const auto broadcast_value_vec = wrapper::vdup_n(broadcast_value, ExactTagType{});
1230 const auto scale_vec = wrapper::vdup_n(scale, ExactTagType{});
1231
1232 // Compute window_step_x elements per iteration
1233 int x = window_start_x;
1234 for(; x <= (window_end_x - window_step_x); x += window_step_x)
1235 {
1236 const auto non_broadcast_v = wrapper::vloadq(non_broadcast_input_ptr + x);
1237 auto res = wrapper::vmul(wrapper::vmul(broadcast_value_vec, non_broadcast_v), scale_vec);
1238 wrapper::vstore(output_ptr + x, res);
1239 }
1240
1241 // Compute left-over elements
1242 for(; x < window_end_x; ++x)
1243 {
1244 const auto non_broadcast_v = *(non_broadcast_input_ptr + x);
1245 *(output_ptr + x) = broadcast_value * non_broadcast_v * scale;
1246 }
1247 },
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001248 broadcast_input, non_broadcast_input, dst);
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +01001249 }
1250 else
1251 {
1252 // Clear X Dimension on execution window as we handle manually
1253 input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
1254 input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
1255
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001256 Iterator input1(src1, input1_win);
1257 Iterator input2(src2, input2_win);
1258 Iterator dst(out, win);
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +01001259
Omar Al Khatib605a9282022-11-01 17:01:24 +00001260 execute_window_loop(
1261 win, [&](const Coordinates &)
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +01001262 {
1263 const auto input1_ptr = reinterpret_cast<const float *>(input1.ptr());
1264 const auto input2_ptr = reinterpret_cast<const float *>(input2.ptr());
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001265 const auto output_ptr = reinterpret_cast<float *>(dst.ptr());
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +01001266
1267 // Compute window_step_x elements per iteration
1268 int x = window_start_x;
1269 for(; x <= (window_end_x - window_step_x); x += window_step_x)
1270 {
1271 const auto ta1 = wrapper::vloadq(input1_ptr + x);
1272 const auto ta2 = wrapper::vloadq(input2_ptr + x);
1273 const auto scale_vec = wrapper::vdup_n(scale, ExactTagType{});
1274 const auto res = wrapper::vmul(wrapper::vmul(ta1, ta2), scale_vec);
1275 wrapper::vstore(output_ptr + x, res);
1276 }
1277
1278 // Compute left-over elements
1279 for(; x < window_end_x; ++x)
1280 {
1281 const auto ta1 = *(input1_ptr + x);
1282 const auto ta2 = *(input2_ptr + x);
1283 *(output_ptr + x) = ta1 * ta2 * scale;
1284 }
1285 },
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001286 input1, input2, dst);
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +01001287 }
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001288}
1289
/** Element-wise multiplication of two complex (2-channel interleaved) F32 tensors.
 *
 * Each logical element is a complex number stored as [real, imag], which is why
 * every index below is scaled by 2 ("2 * x" addresses a real/imag pair).
 *
 * @param src1   First complex input tensor (2-channel F32).
 * @param src2   Second complex input tensor (2-channel F32).
 * @param out    Destination complex tensor (2-channel F32).
 * @param window Execution window.
 */
void c_mul_F32_F32_F32_n(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window)
{
    // Create input windows
    Window input1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
    Window input2_win = window.broadcast_if_dimension_le_one(src2->info()->tensor_shape());

    // Clear X Dimension on execution window as we handle manually
    Window win = window;
    win.set(Window::DimX, Window::Dimension(0, 1, 1));

    // 8 / sizeof(float) == 2 complex elements (4 floats) processed per vector iteration.
    constexpr int window_step_x         = 8 / sizeof(float);
    const auto    window_start_x        = static_cast<int>(window.x().start());
    const auto    window_end_x          = static_cast<int>(window.x().end());
    const bool    is_broadcast_across_x = src1->info()->tensor_shape().x() != src2->info()->tensor_shape().x();

    using ExactTagType = typename wrapper::traits::neon_vector<float, 2>::tag_type;

    if(is_broadcast_across_x)
    {
        // Identify which input is the broadcast (stride-0 in X) operand.
        const bool     is_broadcast_input_2 = input2_win.x().step() == 0;
        Window         broadcast_win        = is_broadcast_input_2 ? input2_win : input1_win;
        Window         non_broadcast_win    = !is_broadcast_input_2 ? input2_win : input1_win;
        const ITensor *broadcast_tensor     = is_broadcast_input_2 ? src2 : src1;
        const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? src2 : src1;

        // Clear X Dimension on execution window as we handle manually
        non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));

        Iterator broadcast_input(broadcast_tensor, broadcast_win);
        Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
        Iterator dst(out, win);

        execute_window_loop(
            win, [&](const Coordinates &)
        {
            const auto non_broadcast_input_ptr = reinterpret_cast<const float *>(non_broadcast_input.ptr());
            const auto output_ptr              = reinterpret_cast<float *>(dst.ptr());

            // NOTE(review): a single float is read from the broadcast tensor and used for
            // BOTH the real and imaginary parts of the broadcast operand (see the scalar
            // tail below, which matches the vector path) — confirm this is the intended
            // broadcast semantics for complex inputs.
            const float broadcast_value = *reinterpret_cast<const float *>(broadcast_input.ptr());

            // Compute window_step_x elements per iteration
            int x = window_start_x;
            for(; x <= (window_end_x - window_step_x); x += window_step_x)
            {
                // a = [r0, i0, r1, i1] : two complex elements from the non-broadcast input.
                const auto  a = wrapper::vloadq(non_broadcast_input_ptr + 2 * x);
                float32x4_t b = vdupq_n_f32(broadcast_value);

                // mask flips the sign of the lanes that contribute the "- imag*imag" term.
                const float32x4_t mask  = { -1.0f, 1.0f, -1.0f, 1.0f };
                // Rearrange a into tmp0 = [r0, r0, r1, r1] and tmp1 = [i0, i0, i1, i1].
                const float32x2_t tmp00 = wrapper::vdup_n(wrapper::vgetlane(a, 0), ExactTagType{});
                const float32x2_t tmp01 = wrapper::vdup_n(wrapper::vgetlane(a, 1), ExactTagType{});
                const float32x2_t tmp10 = wrapper::vdup_n(wrapper::vgetlane(a, 2), ExactTagType{});
                const float32x2_t tmp11 = wrapper::vdup_n(wrapper::vgetlane(a, 3), ExactTagType{});

                const float32x4_t tmp0 = wrapper::vcombine(tmp00, tmp10);
                const float32x4_t tmp1 = wrapper::vcombine(tmp01, tmp11);

                // res = real(a)*b, then accumulate imag(a)*(sign-masked b):
                // per pair this yields [r*b - i*b, r*b + i*b].
                float32x4_t res = wrapper::vmul(tmp0, b);
                b               = wrapper::vmul(b, mask);

                res = wrapper::vmla(res, tmp1, b);
                wrapper::vstore(output_ptr + 2 * x, res);
            }

            // Compute left-over elements (scalar complex multiply, same math as above)
            for(; x < window_end_x; ++x)
            {
                const auto non_broadcast_value0 = *(non_broadcast_input_ptr + 2 * x);
                const auto non_broadcast_value1 = *(non_broadcast_input_ptr + 2 * x + 1);
                auto       res1                 = broadcast_value * (non_broadcast_value0 - non_broadcast_value1);
                auto       res2                 = broadcast_value * (non_broadcast_value1 + non_broadcast_value0);
                *(output_ptr + 2 * x)     = res1;
                *(output_ptr + 2 * x + 1) = res2;
            }
        },
        broadcast_input, non_broadcast_input, dst);
    }
    else
    {
        // Clear X Dimension on execution window as we handle manually
        input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
        input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));

        Iterator input1(src1, input1_win);
        Iterator input2(src2, input2_win);
        Iterator dst(out, win);

        execute_window_loop(
            win, [&](const Coordinates &)
        {
            const auto input1_ptr = reinterpret_cast<const float *>(input1.ptr());
            const auto input2_ptr = reinterpret_cast<const float *>(input2.ptr());
            const auto output_ptr = reinterpret_cast<float *>(dst.ptr());

            // Compute window_step_x elements per iteration
            int x = window_start_x;
            for(; x <= (window_end_x - window_step_x); x += window_step_x)
            {
                // a = [a0r, a0i, a1r, a1i], b = [b0r, b0i, b1r, b1i]
                const float32x4_t a = wrapper::vloadq(input1_ptr + 2 * x);
                float32x4_t       b = wrapper::vloadq(input2_ptr + 2 * x);

                const float32x4_t mask  = { -1.0f, 1.0f, -1.0f, 1.0f };
                // tmp0 = [a0r, a0r, a1r, a1r], tmp1 = [a0i, a0i, a1i, a1i]
                const float32x2_t tmp00 = wrapper::vdup_n(wrapper::vgetlane(a, 0), ExactTagType{});
                const float32x2_t tmp01 = wrapper::vdup_n(wrapper::vgetlane(a, 1), ExactTagType{});
                const float32x2_t tmp10 = wrapper::vdup_n(wrapper::vgetlane(a, 2), ExactTagType{});
                const float32x2_t tmp11 = wrapper::vdup_n(wrapper::vgetlane(a, 3), ExactTagType{});

                const float32x4_t tmp0 = wrapper::vcombine(tmp00, tmp10);
                const float32x4_t tmp1 = wrapper::vcombine(tmp01, tmp11);

                // res = a.real * b
                float32x4_t res = wrapper::vmul(tmp0, b);

                // Swap real/imag of b within each 64-bit pair, then sign-mask so that
                // accumulating a.imag * b' produces [-a_i*b_i, +a_i*b_r] per element:
                // out = [a_r*b_r - a_i*b_i, a_r*b_i + a_i*b_r].
                b = wrapper::vrev64(b);
                b = wrapper::vmul(b, mask);

                res = wrapper::vmla(res, tmp1, b);
                wrapper::vstore(output_ptr + 2 * x, res);
            }

            // Compute left-over elements (textbook scalar complex multiplication)
            for(; x < window_end_x; ++x)
            {
                const auto a0 = *(input1_ptr + 2 * x);
                const auto a1 = *(input1_ptr + 2 * x + 1);
                const auto b0 = *(input2_ptr + 2 * x);
                const auto b1 = *(input2_ptr + 2 * x + 1);
                auto       res1 = a0 * b0 - a1 * b1;
                auto       res2 = a0 * b1 + a1 * b0;
                *(output_ptr + 2 * x)     = res1;
                *(output_ptr + 2 * x + 1) = res2;
            }
        },
        input1, input2, dst);
    }
}
1424
Ioan-Cristian Szabo5edbd1c2017-11-13 13:34:08 +00001425#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
/** Element-wise multiplication of two F16 tensors with a scaling factor: out = src1 * src2 * scale.
 *
 * Supports X-dimension broadcasting of either input. Processes 16 half-precision
 * elements per vector iteration (two float16x8 lanes), with a scalar tail loop.
 *
 * @param src1   First input tensor (F16).
 * @param src2   Second input tensor (F16).
 * @param out    Destination tensor (F16).
 * @param window Execution window.
 * @param scale  Scale to apply after multiplication.
 */
void mul_F16_F16_F16(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, float scale)
{
    // Create input windows
    Window input1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
    Window input2_win = window.broadcast_if_dimension_le_one(src2->info()->tensor_shape());

    // Clear X Dimension on execution window as we handle manually
    Window win = window;
    win.set(Window::DimX, Window::Dimension(0, 1, 1));
    constexpr int window_step_x         = 16;
    const auto    window_start_x        = static_cast<int>(window.x().start());
    const auto    window_end_x          = static_cast<int>(window.x().end());
    const bool    is_broadcast_across_x = src1->info()->tensor_shape().x() != src2->info()->tensor_shape().x();
    if(is_broadcast_across_x)
    {
        // A stride-0 X dimension identifies the broadcast operand.
        const bool     is_broadcast_input_2 = input2_win.x().step() == 0;
        Window         broadcast_win        = is_broadcast_input_2 ? input2_win : input1_win;
        Window         non_broadcast_win    = !is_broadcast_input_2 ? input2_win : input1_win;
        const ITensor *broadcast_tensor     = is_broadcast_input_2 ? src2 : src1;
        const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? src2 : src1;
        // Clear X Dimension on execution window as we handle manually
        non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));
        Iterator broadcast_input(broadcast_tensor, broadcast_win);
        Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
        Iterator dst(out, win);
        execute_window_loop(
            win, [&](const Coordinates &)
        {
            const auto non_broadcast_input_ptr = reinterpret_cast<const float16_t *>(non_broadcast_input.ptr());
            const auto output_ptr              = reinterpret_cast<float16_t *>(dst.ptr());
            // Single broadcast scalar, splatted across both 8-lane halves.
            const auto broadcast_value = *reinterpret_cast<const float16_t *>(broadcast_input.ptr());
            const float16x8x2_t broadcast_value_vec =
            {
                {
                    vdupq_n_f16(broadcast_value),
                    vdupq_n_f16(broadcast_value),
                }
            };
            const auto scale_vec = vdupq_n_f16(scale);
            // Compute window_step_x elements per iteration
            int x = window_start_x;
            for(; x <= (window_end_x - window_step_x); x += window_step_x)
            {
                const float16x8x2_t non_broadcast_v =
                {
                    {
                        vld1q_f16(non_broadcast_input_ptr + x),
                        vld1q_f16(non_broadcast_input_ptr + x + 8),
                    }
                };
                // (broadcast * value) * scale for each half.
                const float16x8x2_t result =
                {
                    {
                        vmulq_f16(vmulq_f16(broadcast_value_vec.val[0], non_broadcast_v.val[0]), scale_vec),
                        vmulq_f16(vmulq_f16(broadcast_value_vec.val[1], non_broadcast_v.val[1]), scale_vec),
                    }
                };
                vst1q_f16(output_ptr + x, result.val[0]);
                vst1q_f16(output_ptr + x + 8, result.val[1]);
            }
            // Compute left-over elements
            for(; x < window_end_x; ++x)
            {
                const auto non_broadcast_v = *(non_broadcast_input_ptr + x);
                *(output_ptr + x)          = broadcast_value * non_broadcast_v * scale;
            }
        },
        broadcast_input, non_broadcast_input, dst);
    }
    else
    {
        // No broadcasting: both inputs iterate element-by-element in X.
        input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
        input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
        Iterator input1(src1, input1_win);
        Iterator input2(src2, input2_win);
        Iterator dst(out, win);
        execute_window_loop(
            win, [&](const Coordinates &)
        {
            const auto input1_ptr = reinterpret_cast<const float16_t *>(input1.ptr());
            const auto input2_ptr = reinterpret_cast<const float16_t *>(input2.ptr());
            const auto output_ptr = reinterpret_cast<float16_t *>(dst.ptr());
            // Compute window_step_x elements per iteration
            int x = window_start_x;
            for(; x <= (window_end_x - window_step_x); x += window_step_x)
            {
                const float16x8x2_t ta1 =
                {
                    {
                        vld1q_f16(input1_ptr + x),
                        vld1q_f16(input1_ptr + x + 8),
                    }
                };
                const float16x8x2_t ta2 =
                {
                    {
                        vld1q_f16(input2_ptr + x),
                        vld1q_f16(input2_ptr + x + 8),
                    }
                };
                const float16x8_t scale_vec = vdupq_n_f16(scale);
                const float16x8x2_t result =
                {
                    {
                        vmulq_f16(vmulq_f16(ta1.val[0], ta2.val[0]), scale_vec),
                        vmulq_f16(vmulq_f16(ta1.val[1], ta2.val[1]), scale_vec),
                    }
                };
                vst1q_f16(output_ptr + x, result.val[0]);
                vst1q_f16(output_ptr + x + 8, result.val[1]);
            }
            // Compute left-over elements
            for(; x < window_end_x; ++x)
            {
                const auto ta1 = *(input1_ptr + x);
                const auto ta2 = *(input2_ptr + x);
                *(output_ptr + x) = ta1 * ta2 * scale;
            }
        },
        input1, input2, dst);
    }
}
Ioan-Cristian Szabo5edbd1c2017-11-13 13:34:08 +00001548#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
Pablo Tellodf246182017-07-03 16:25:09 +01001549
/** Element-wise multiplication U8 * U8 -> S16.
 *
 * @tparam is_scale255 True when the scale is 1/255 (handled with a dedicated
 *                     rounding helper); otherwise the scale is 1/2^n and is
 *                     applied as a right shift by @p n.
 * @tparam is_sat      True to saturate the result to the S16 range.
 *
 * @param src1   First input tensor (U8).
 * @param src2   Second input tensor (U8).
 * @param out    Destination tensor (S16).
 * @param window Execution window.
 * @param n      Right-shift amount used when is_scale255 is false.
 */
template <bool is_scale255, bool is_sat>
void mul_U8_U8_S16(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, int n)
{
    // Create input windows
    Window win        = window;
    Window input1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
    Window input2_win = window.broadcast_if_dimension_le_one(src2->info()->tensor_shape());

    // Clear X Dimension on execution window as we handle manually
    win.set(Window::DimX, Window::Dimension(0, 1, 1));
    input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
    input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));

    Iterator input1(src1, input1_win);
    Iterator input2(src2, input2_win);
    Iterator dst(out, win);

    const int  window_step_x  = 16 / sizeof(uint8_t);
    const auto window_start_x = static_cast<int>(window.x().start());
    const auto window_end_x   = static_cast<int>(window.x().end());

    execute_window_loop(
        win, [&](const Coordinates &)
    {
        const auto input1_ptr = reinterpret_cast<const uint8_t *>(input1.ptr());
        const auto input2_ptr = reinterpret_cast<const uint8_t *>(input2.ptr());
        const auto output_ptr = reinterpret_cast<int16_t *>(dst.ptr());

        // Compute window_step_x elements per iteration
        int x = window_start_x;
        for(; x <= (window_end_x - window_step_x); x += window_step_x)
        {
            const uint8x16_t bv = wrapper::vloadq(input2_ptr + x);
            const uint8x16_t av = wrapper::vloadq(input1_ptr + x);

            // Widen each u8 half to u16 and multiply; a u8*u8 product always fits in u16.
            uint16x8_t tmp_low  = vmovl_u8(vget_low_u8(av));
            uint16x8_t tmp_high = vmovl_u8(vget_high_u8(av));
            tmp_low             = vmulq_u16(tmp_low, vmovl_u8(vget_low_u8(bv)));
            tmp_high            = vmulq_u16(tmp_high, vmovl_u8(vget_high_u8(bv)));

            if(is_scale255)
            {
                // Dedicated 1/255 scaling (see scalar tail which uses scale255_constant).
                tmp_low  = scale255_U16_U16(tmp_low);
                tmp_high = scale255_U16_U16(tmp_high);
            }
            else
            {
                // Negative shift count => right shift by n (optionally saturating).
                const int16x8_t vn = vdupq_n_s16(-n);

                if(is_sat)
                {
                    tmp_low  = vqshlq_u16(tmp_low, vn);
                    tmp_high = vqshlq_u16(tmp_high, vn);
                }
                else
                {
                    tmp_low  = vshlq_u16(tmp_low, vn);
                    tmp_high = vshlq_u16(tmp_high, vn);
                }
            }

            if(is_sat)
            {
                // Clamp the (unsigned) result to SHRT_MAX so the s16 reinterpret below
                // cannot produce a negative value.
                static const uint16x8_t max = vdupq_n_u16(SHRT_MAX);

                tmp_low  = vminq_u16(tmp_low, max);
                tmp_high = vminq_u16(tmp_high, max);
            }

            vst1q_s16(output_ptr + x, vreinterpretq_s16_u16(tmp_low));
            vst1q_s16(output_ptr + x + 8, vreinterpretq_s16_u16(tmp_high));
        }

        // Compute left-over elements
        for(; x < window_end_x; ++x)
        {
            int32_t tmp = static_cast<int32_t>(*(input1_ptr + x)) * static_cast<int32_t>(*(input2_ptr + x));

            if(is_scale255)
            {
                // Round-to-nearest after multiplying by 1/255.
                float tmp_f = static_cast<float>(tmp) * scale255_constant;
                tmp         = static_cast<int32_t>(tmp_f + 0.5f);
            }
            else
            {
                tmp >>= n;
            }

            if(is_sat)
            {
                // tmp is a product of two unsigned values, so only the upper bound matters.
                tmp = (tmp > SHRT_MAX) ? SHRT_MAX : tmp;
            }

            *(output_ptr + x) = static_cast<int16_t>(tmp);
        }
    },
    input1, input2, dst);
}
1648
/** Element-wise multiplication S16 * U8 -> S16.
 *
 * The U8 input is widened to S16 and the shared S16*S16 helper
 * (mul_S16_S16_S16_n_k) performs the vectorised multiply/scale.
 *
 * @tparam is_scale255 True when the scale is 1/255; otherwise the scale is
 *                     1/2^n applied as a right shift by @p n.
 * @tparam is_sat      True to saturate the result to the S16 range.
 *
 * @param src1   First input tensor (S16).
 * @param src2   Second input tensor (U8).
 * @param out    Destination tensor (S16).
 * @param window Execution window.
 * @param n      Right-shift amount used when is_scale255 is false.
 */
template <bool is_scale255, bool is_sat>
void mul_S16_U8_S16(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, int n)
{
    // Create input windows
    Window win        = window;
    Window input1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
    Window input2_win = window.broadcast_if_dimension_le_one(src2->info()->tensor_shape());

    // Clear X Dimension on execution window as we handle manually
    win.set(Window::DimX, Window::Dimension(0, 1, 1));
    input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
    input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));

    Iterator input1(src1, input1_win);
    Iterator input2(src2, input2_win);
    Iterator dst(out, win);

    const int  window_step_x  = 16;
    const auto window_start_x = static_cast<int>(window.x().start());
    const auto window_end_x   = static_cast<int>(window.x().end());

    execute_window_loop(
        win, [&](const Coordinates &)
    {
        const auto input1_ptr = reinterpret_cast<const int16_t *>(input1.ptr());
        const auto input2_ptr = reinterpret_cast<const uint8_t *>(input2.ptr());
        const auto output_ptr = reinterpret_cast<int16_t *>(dst.ptr());

        // Compute window_step_x elements per iteration
        int x = window_start_x;
        for(; x <= (window_end_x - window_step_x); x += window_step_x)
        {
            const int16x8x2_t ta1 =
            {
                {
                    vld1q_s16(input1_ptr + x),
                    vld1q_s16(input1_ptr + x + 8),
                }
            };
            const uint8x8x2_t ta2u =
            {
                {
                    vld1_u8(input2_ptr + x),
                    vld1_u8(input2_ptr + x + 8),
                }
            };
            // Widen u8 -> u16; reinterpret as s16 is safe since u8 values are <= 255.
            const int16x8x2_t ta2 =
            {
                {
                    vreinterpretq_s16_u16(vmovl_u8(ta2u.val[0])),
                    vreinterpretq_s16_u16(vmovl_u8(ta2u.val[1]))
                }
            };

            // Shared helper performs the multiply plus 1/255 or 1/2^n scaling.
            const int16x8x2_t result = mul_S16_S16_S16_n_k<is_scale255, is_sat>(ta1, ta2, n);

            vst1q_s16(output_ptr + x, result.val[0]);
            vst1q_s16(output_ptr + x + 8, result.val[1]);
        }

        // Compute left-over elements
        for(; x < window_end_x; ++x)
        {
            int32_t tmp = static_cast<int32_t>(*(input1_ptr + x)) * static_cast<int32_t>(*(input2_ptr + x));

            if(is_scale255)
            {
                // Round-to-nearest after multiplying by 1/255.
                float tmp_f = static_cast<float>(tmp) * scale255_constant;

                tmp = static_cast<int32_t>(tmp_f + 0.5f);
            }
            else
            {
                if(tmp >= 0)
                {
                    tmp >>= n;
                }
                else
                {
                    // For negative values, bias by (2^n - 1) before shifting so the
                    // result rounds towards zero instead of towards -infinity.
                    uint32_t mask = (1u << n) - 1;
                    tmp           = (tmp + static_cast<int32_t>(mask)) >> n;
                }
            }
            if(is_sat)
            {
                tmp = (tmp > SHRT_MAX) ? SHRT_MAX : ((tmp < SHRT_MIN) ? SHRT_MIN : tmp);
            }
            *(output_ptr + x) = static_cast<int16_t>(tmp);
        }
    },
    input1, input2, dst);
}
1741
/** Element-wise multiplication U8 * S16 -> S16.
 *
 * Multiplication is commutative, so this simply forwards to mul_S16_U8_S16
 * with the inputs swapped.
 */
template <bool is_scale255, bool is_sat>
void mul_U8_S16_S16(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, int n)
{
    // Simply swap the two input buffers
    mul_S16_U8_S16<is_scale255, is_sat>(src2, src1, out, window, n);
}
1748} // namespace
1749
/** Configure the kernel: validate arguments, derive the scaling strategy and
 *  select the specialised multiplication routine for the data-type combination.
 *
 * The scale is either ~1/255 (handled by dedicated rounding code) or expected
 * to be 1/2^n, in which case only the exponent is stored and applied as a shift.
 */
void CpuMulKernel::configure(ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy)
{
    ARM_COMPUTE_UNUSED(rounding_policy);
    ARM_COMPUTE_ERROR_ON_NULLPTR(src1, src2, dst);

    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src1, src2, dst, scale, overflow_policy, rounding_policy));

    const TensorShape &out_shape = TensorShape::broadcast_shape(src1->tensor_shape(), src2->tensor_shape());

    // Auto initialize dst if not initialized
    set_shape_if_empty(*dst, out_shape);

    // Reset state; exactly one of the three function pointers is set below.
    _scale          = scale;
    _scale_exponent = 0;
    _func_quantized = nullptr;
    _func_int       = nullptr;
    _func_float     = nullptr;

    bool is_scale_255 = false;
    // Check and validate scaling factor
    if(std::abs(scale - scale255_constant) < 0.00001f)
    {
        is_scale_255 = true;
    }
    else
    {
        int exponent = 0;

        std::frexp(scale, &exponent);

        // Store the positive exponent. We know that we compute 1/2^n
        // Additionally we need to subtract 1 to compensate that frexp used a mantissa of 0.5
        _scale_exponent = std::abs(exponent - 1);
    }

    const DataType dt_input1 = src1->data_type();
    const DataType dt_input2 = src2->data_type();
    const DataType dt_output = dst->data_type();
    const bool     is_sat    = (overflow_policy == ConvertPolicy::SATURATE);

    // Dispatch on (input1, input2, output) data types. Unsupported combinations
    // leave all function pointers null and fall through to the error below.
    switch(dt_input1)
    {
        case DataType::QASYMM8:
            if(dt_input2 == DataType::QASYMM8 && dt_output == DataType::QASYMM8)
            {
                // Prefer the fixed-point fast path when the quantization parameters allow it.
                if(mul_q8_neon_fixedpoint_possible(src1, src2, dst, scale))
                {
                    _func_quantized = &mul_q8_neon_fixedpoint<uint8_t>;
                }
                else
                {
                    _func_quantized = &mul_saturate_quantized_8<uint8_t>;
                }
            }
            break;
        case DataType::QASYMM8_SIGNED:
            if(dt_input2 == DataType::QASYMM8_SIGNED)
            {
                if(mul_q8_neon_fixedpoint_possible(src1, src2, dst, scale))
                {
                    _func_quantized = &mul_q8_neon_fixedpoint<int8_t>;
                }
                else
                {
                    _func_quantized = &mul_saturate_quantized_8<int8_t>;
                }
            }
            break;
        case DataType::QSYMM16:
            if(dt_input2 == DataType::QSYMM16 && dt_output == DataType::QSYMM16)
            {
                _func_quantized = &mul_saturate_QSYMM16_QSYMM16_QSYMM16;
            }
            else if(dt_input2 == DataType::QSYMM16 && dt_output == DataType::S32)
            {
                _func_int = &mul_QSYMM16_QSYMM16_S32;
            }
            break;
        case DataType::S16:
            if(DataType::U8 == dt_input2 && DataType::S16 == dt_output)
            {
                if(is_scale_255)
                {
                    _func_int = is_sat ? &mul_S16_U8_S16<true, true> : &mul_S16_U8_S16<true, false>;
                }
                else
                {
                    _func_int = is_sat ? &mul_S16_U8_S16<false, true> : &mul_S16_U8_S16<false, false>;
                }
            }
            if(DataType::S16 == dt_input2 && DataType::S16 == dt_output)
            {
                if(is_scale_255)
                {
                    _func_int = is_sat ? &mul_S16_S16_S16<true, true> : &mul_S16_S16_S16<true, false>;
                }
                else
                {
                    _func_int = is_sat ? &mul_S16_S16_S16<false, true> : &mul_S16_S16_S16<false, false>;
                }
            }
            break;
        case DataType::S32:
            if(DataType::S32 == dt_input2 && DataType::S32 == dt_output)
            {
                _func_int = is_sat ? &mul_S32_S32_S32<true> : &mul_S32_S32_S32<false>;
            }
            break;
        case DataType::U8:
            if(DataType::U8 == dt_input2 && DataType::U8 == dt_output)
            {
                if(is_scale_255)
                {
                    _func_int = is_sat ? &mul_U8_U8_U8<true, true> : &mul_U8_U8_U8<true, false>;
                }
                else
                {
                    _func_int = is_sat ? &mul_U8_U8_U8<false, true> : &mul_U8_U8_U8<false, false>;
                }
            }
            else if(DataType::U8 == dt_input2 && DataType::S16 == dt_output)
            {
                if(is_scale_255)
                {
                    _func_int = is_sat ? &mul_U8_U8_S16<true, true> : &mul_U8_U8_S16<true, false>;
                }
                else
                {
                    _func_int = is_sat ? &mul_U8_U8_S16<false, true> : &mul_U8_U8_S16<false, false>;
                }
            }
            else if(DataType::S16 == dt_input2 && DataType::S16 == dt_output)
            {
                if(is_scale_255)
                {
                    _func_int = is_sat ? &mul_U8_S16_S16<true, true> : &mul_U8_S16_S16<true, false>;
                }
                else
                {
                    _func_int = is_sat ? &mul_U8_S16_S16<false, true> : &mul_U8_S16_S16<false, false>;
                }
            }
            break;
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
        case DataType::F16:
            _func_float = &mul_F16_F16_F16;
            break;
#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
        case DataType::F32:
            _func_float = &mul_F32_F32_F32;
            break;
        default:
            ARM_COMPUTE_ERROR("You called with the wrong img formats");
    }

    // Configure kernel window; also records which dimension the scheduler
    // should split the workload along (used by get_mws()).
    Window win;
    std::tie(win, _split_dimension) = calculate_squashed_or_max_window(*src1, *src2);

    ICpuKernel::configure(win);
}
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001911
/** Static validation: checks whether the given configuration is supported
 *  without mutating any state. Returns an error Status on failure.
 */
Status CpuMulKernel::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float scale, ConvertPolicy overflow_policy,
                              RoundingPolicy rounding_policy)
{
    ARM_COMPUTE_ERROR_ON_NULLPTR(src1, src2, dst);
    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src1, src2, dst, scale, overflow_policy, rounding_policy));

    return Status{};
}
1920
Georgios Pinitas0dc0d8e2021-04-30 03:18:37 +01001921void CpuMulKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001922{
Moritz Pflanzerc186b572017-09-07 09:48:04 +01001923 ARM_COMPUTE_UNUSED(info);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001924 ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001925 ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001926
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001927 auto src1 = tensors.get_const_tensor(TensorType::ACL_SRC_0);
1928 auto src2 = tensors.get_const_tensor(TensorType::ACL_SRC_1);
1929 auto dst = tensors.get_tensor(TensorType::ACL_DST);
Michalis Spyrou6eb73452020-07-02 17:39:25 +01001930
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +01001931 if(_func_quantized != nullptr)
Michalis Spyrou861f0db2018-02-26 16:47:58 +00001932 {
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001933 (*_func_quantized)(src1, src2, dst, window, _scale);
Manuel Bottini79fa9a22019-02-22 17:54:22 +00001934 }
1935 else if(_func_int != nullptr)
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001936 {
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001937 (*_func_int)(src1, src2, dst, window, _scale_exponent);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001938 }
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001939 else
1940 {
1941 ARM_COMPUTE_ERROR_ON(_func_float == nullptr);
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001942 (*_func_float)(src1, src2, dst, window, _scale);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001943 }
1944}
Viet-Hoa Dod4a9cc02022-11-08 12:01:21 +00001945
Georgios Pinitas0dc0d8e2021-04-30 03:18:37 +01001946const char *CpuMulKernel::name() const
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001947{
Georgios Pinitas0dc0d8e2021-04-30 03:18:37 +01001948 return "CpuMulKernel";
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001949}
Viet-Hoa Dod4a9cc02022-11-08 12:01:21 +00001950
1951size_t CpuMulKernel::get_mws(const CPUInfo &platform, size_t thread_count) const
1952{
1953 ARM_COMPUTE_UNUSED(platform, thread_count);
1954
1955 if(_split_dimension == Window::DimX)
1956 {
1957 // Don't split the work load too small if the tensor has been reinterpreted as 1D.
1958 // This number is loosely chosen as threading overhead in each platform varies wildly.
1959 return 10240;
1960 }
1961
1962 return default_mws;
1963}
1964
giuros01154bc1c2019-03-26 17:44:40 +00001965namespace
1966{
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001967Status validate_arguments_complex(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst)
giuros01154bc1c2019-03-26 17:44:40 +00001968{
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001969 ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src1, 2, DataType::F32);
1970 ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src2, 2, DataType::F32);
giuros01154bc1c2019-03-26 17:44:40 +00001971
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001972 const TensorShape &out_shape = TensorShape::broadcast_shape(src1->tensor_shape(), src2->tensor_shape());
giuros01154bc1c2019-03-26 17:44:40 +00001973
1974 ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible");
1975
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001976 // Validate in case of configured dst
1977 if(dst->total_size() > 0)
giuros01154bc1c2019-03-26 17:44:40 +00001978 {
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001979 ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 2, DataType::F32);
1980 ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, dst->tensor_shape(), 0), "Wrong shape for dst");
giuros01154bc1c2019-03-26 17:44:40 +00001981 }
1982
1983 return Status{};
1984}
Sheri Zhang4d91dc62020-09-23 11:22:50 +01001985} // namespace
giuros01154bc1c2019-03-26 17:44:40 +00001986
Georgios Pinitas0dc0d8e2021-04-30 03:18:37 +01001987void CpuComplexMulKernel::configure(ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst)
giuros01154bc1c2019-03-26 17:44:40 +00001988{
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001989 ARM_COMPUTE_ERROR_ON_NULLPTR(src1, src2, dst);
1990 ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_complex(src1, src2, dst));
Sheri Zhang4d91dc62020-09-23 11:22:50 +01001991
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001992 const TensorShape &out_shape = TensorShape::broadcast_shape(src1->tensor_shape(), src2->tensor_shape());
giuros01154bc1c2019-03-26 17:44:40 +00001993
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001994 // Auto initialize dst if not initialized
1995 const TensorInfo out_info(out_shape, src1->num_channels(), src1->data_type());
1996 auto_init_if_empty(*dst, out_info);
giuros01154bc1c2019-03-26 17:44:40 +00001997
giuros01154bc1c2019-03-26 17:44:40 +00001998 // Configure kernel window
SiCongLic7b1e842021-02-22 14:28:33 +00001999 Window win = calculate_max_window(out_shape);
giuros01154bc1c2019-03-26 17:44:40 +00002000
Sheri Zhang1e3ab422021-03-16 17:35:08 +00002001 ICpuKernel::configure(win);
giuros01154bc1c2019-03-26 17:44:40 +00002002}
2003
Georgios Pinitas0dc0d8e2021-04-30 03:18:37 +01002004Status CpuComplexMulKernel::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst)
giuros01154bc1c2019-03-26 17:44:40 +00002005{
Sheri Zhang1e3ab422021-03-16 17:35:08 +00002006 ARM_COMPUTE_ERROR_ON_NULLPTR(src1, src2, dst);
2007 ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_complex(src1, src2, dst));
giuros01154bc1c2019-03-26 17:44:40 +00002008
2009 return Status{};
2010}
2011
Georgios Pinitas0dc0d8e2021-04-30 03:18:37 +01002012void CpuComplexMulKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
giuros01154bc1c2019-03-26 17:44:40 +00002013{
2014 ARM_COMPUTE_UNUSED(info);
2015 ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
Sheri Zhang1e3ab422021-03-16 17:35:08 +00002016 ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window);
giuros01154bc1c2019-03-26 17:44:40 +00002017
Sheri Zhang1e3ab422021-03-16 17:35:08 +00002018 auto src1 = tensors.get_const_tensor(TensorType::ACL_SRC_0);
2019 auto src2 = tensors.get_const_tensor(TensorType::ACL_SRC_1);
2020 auto dst = tensors.get_tensor(TensorType::ACL_DST);
Michalis Spyrou6eb73452020-07-02 17:39:25 +01002021
Sheri Zhang1e3ab422021-03-16 17:35:08 +00002022 c_mul_F32_F32_F32_n(src1, src2, dst, window);
giuros01154bc1c2019-03-26 17:44:40 +00002023}
Sheri Zhang1e3ab422021-03-16 17:35:08 +00002024
Georgios Pinitas0dc0d8e2021-04-30 03:18:37 +01002025const char *CpuComplexMulKernel::name() const
Sheri Zhang1e3ab422021-03-16 17:35:08 +00002026{
Georgios Pinitas0dc0d8e2021-04-30 03:18:37 +01002027 return "CpuComplexMulKernel";
Sheri Zhang1e3ab422021-03-16 17:35:08 +00002028}
2029} // namespace kernels
2030} // namespace cpu
Manuel Bottini79fa9a22019-02-22 17:54:22 +00002031} // namespace arm_compute