blob: 91b7552ecf1bac137ce058c46c8522e1d3d749c6 [file] [log] [blame]
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001/*
SiCongLic7b1e842021-02-22 14:28:33 +00002 * Copyright (c) 2016-2021 Arm Limited.
Anthony Barbier6ff3b192017-09-04 18:44:23 +01003 *
4 * SPDX-License-Identifier: MIT
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to
8 * deal in the Software without restriction, including without limitation the
9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10 * sell copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in all
14 * copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
Sheri Zhang1e3ab422021-03-16 17:35:08 +000024#include "src/core/cpu/kernels/CpuPixelWiseMultiplicationKernel.h"
Anthony Barbier6ff3b192017-09-04 18:44:23 +010025
Sheri Zhang1e3ab422021-03-16 17:35:08 +000026#include "arm_compute/core/ITensor.h"
Anthony Barbier6ff3b192017-09-04 18:44:23 +010027#include "arm_compute/core/TensorInfo.h"
Sang-Hoon Park68dd25f2020-10-19 16:00:11 +010028#include "src/core/CPP/Validate.h"
Georgios Pinitasddb93bb2020-10-02 16:38:59 +010029#include "src/core/NEON/NEAsymm.h"
30#include "src/core/NEON/NESymm.h"
31#include "src/core/NEON/wrapper/wrapper.h"
Sang-Hoon Park68dd25f2020-10-19 16:00:11 +010032#include "src/core/helpers/AutoConfiguration.h"
33#include "src/core/helpers/WindowHelpers.h"
Anthony Barbier6ff3b192017-09-04 18:44:23 +010034
35#include <arm_neon.h>
Anthony Barbier6ff3b192017-09-04 18:44:23 +010036
Anthony Barbier6ff3b192017-09-04 18:44:23 +010037namespace arm_compute
38{
Sheri Zhang1e3ab422021-03-16 17:35:08 +000039namespace cpu
40{
41namespace kernels
42{
Anthony Barbier6ff3b192017-09-04 18:44:23 +010043namespace
44{
// 1/255 scale factor used by the scale255 code paths. Kept as a constexpr
// scalar so validate_arguments() can compare the user scale against it at
// no runtime cost.
constexpr float scale255_constant = 1.f / 255.f;
// Vector replicas of the constants above, used by the NEON kernels.
// vdupq_n_f32 is not constexpr, so these stay runtime-initialised globals
// (internal linkage via the enclosing anonymous namespace).
const float32x4_t scale255_constant_f32q = vdupq_n_f32(scale255_constant);
const float32x4_t positive_round_f32q    = vdupq_n_f32(0.5f);
48
Sheri Zhang1e3ab422021-03-16 17:35:08 +000049inline Status validate_arguments(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy)
Ioan-Cristian Szabo754e9522017-11-28 18:29:43 +000050{
51 ARM_COMPUTE_UNUSED(overflow_policy);
52 ARM_COMPUTE_UNUSED(rounding_policy);
53
Sheri Zhang1e3ab422021-03-16 17:35:08 +000054 ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src1);
55 ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src1, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::S16, DataType::S32, DataType::QSYMM16, DataType::F16,
SiCong Libb88f892020-08-28 11:18:47 +010056 DataType::F32);
Sheri Zhang1e3ab422021-03-16 17:35:08 +000057 ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src2, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::S16, DataType::S32, DataType::QSYMM16, DataType::F16,
SiCong Libb88f892020-08-28 11:18:47 +010058 DataType::F32);
Sheri Zhang1e3ab422021-03-16 17:35:08 +000059 ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
Michele Di Giorgio9428a182020-03-30 14:10:20 +010060 DataType::S16, DataType::QSYMM16,
61 DataType::S32, DataType::F16, DataType::F32);
Sheri Zhang1e3ab422021-03-16 17:35:08 +000062 if(is_data_type_quantized(src1->data_type()) || is_data_type_quantized(src2->data_type()))
Pablo Tello52ea9c22019-12-10 11:28:53 +000063 {
Sheri Zhang1e3ab422021-03-16 17:35:08 +000064 ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src1, src2);
Georgios Pinitasd7d7e902019-12-18 15:40:54 +000065 ARM_COMPUTE_RETURN_ERROR_ON_MSG(overflow_policy == ConvertPolicy::WRAP, "ConvertPolicy cannot be WRAP if datatype is quantized");
Pablo Tello52ea9c22019-12-10 11:28:53 +000066 }
Manuel Bottini79fa9a22019-02-22 17:54:22 +000067
Sheri Zhang1e3ab422021-03-16 17:35:08 +000068 if(dst->total_size() > 0)
Manuel Bottini79fa9a22019-02-22 17:54:22 +000069 {
Sheri Zhang1e3ab422021-03-16 17:35:08 +000070 const TensorShape &out_shape = TensorShape::broadcast_shape(src1->tensor_shape(), src2->tensor_shape());
71 ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, dst->tensor_shape(), 0), "Wrong shape for dst");
Manuel Bottini79fa9a22019-02-22 17:54:22 +000072 ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible");
SiCong Libb88f892020-08-28 11:18:47 +010073 // clang-format off
74 ARM_COMPUTE_RETURN_ERROR_ON_MSG(
Sheri Zhang1e3ab422021-03-16 17:35:08 +000075 !(src1->data_type() == src2->data_type() && src2->data_type() == dst->data_type()) &&
76 !(src1->data_type() == DataType::U8 && src2->data_type() == DataType::U8 && dst->data_type() == DataType::S16) &&
77 !(src1->data_type() == DataType::U8 && src2->data_type() == DataType::S16 && dst->data_type() == DataType::S16) &&
78 !(src1->data_type() == DataType::S16 && src2->data_type() == DataType::U8 && dst->data_type() == DataType::S16) &&
79 !(src1->data_type() == DataType::S16 && src2->data_type() == DataType::U8 && dst->data_type() == DataType::S16) &&
80 !(src1->data_type() == DataType::QSYMM16 && src2->data_type() == DataType::QSYMM16 && dst->data_type() == DataType::S32)
SiCong Libb88f892020-08-28 11:18:47 +010081 , "Invalid data type combination");
82 // clang-format on
Sheri Zhang1e3ab422021-03-16 17:35:08 +000083 ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->data_type() == DataType::S16 && dst->data_type() == DataType::S32 && scale != 1.f, "Unsupported scale for QSYMM16 inputs and S32 dst");
Manuel Bottini79fa9a22019-02-22 17:54:22 +000084 }
Michalis Spyrou861f0db2018-02-26 16:47:58 +000085
Ioan-Cristian Szabo754e9522017-11-28 18:29:43 +000086 if(std::abs(scale - scale255_constant) < 0.00001f)
87 {
88 ARM_COMPUTE_RETURN_ERROR_ON(rounding_policy != RoundingPolicy::TO_NEAREST_UP && rounding_policy != RoundingPolicy::TO_NEAREST_EVEN);
Sheri Zhang1e3ab422021-03-16 17:35:08 +000089 ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->data_type() == DataType::S32 && src2->data_type() == DataType::S32 && dst->data_type() == DataType::S32,
90 "Scale == 1/255 is not supported if input and dst are of data type S32");
Ioan-Cristian Szabo754e9522017-11-28 18:29:43 +000091 }
92 else
93 {
94 ARM_COMPUTE_RETURN_ERROR_ON(rounding_policy != RoundingPolicy::TO_ZERO);
95
96 int exponent = 0;
97 const float normalized_mantissa = std::frexp(scale, &exponent);
98
99 // Use int scaling if factor is equal to 1/2^n for 0 <= n <= 15
100 // frexp returns 0.5 as mantissa which means that the exponent will be in the range of -1 <= e <= 14
101 // Moreover, it will be negative as we deal with 1/2^n
102 ARM_COMPUTE_RETURN_ERROR_ON_MSG(!((normalized_mantissa == 0.5f) && (-14 <= exponent) && (exponent <= 1)), "Scale value not supported (Should be 1/(2^n) or 1/255");
103 }
104
Georgios Pinitas631c41a2017-12-06 11:53:03 +0000105 return Status{};
Ioan-Cristian Szabo754e9522017-11-28 18:29:43 +0000106}
107
/** Scale a vector of S32 values by 1/255 and round to nearest (half up).
 *
 * @note Not exact for every input, e.g. floats close to 0.49999999999999994
 *       after scaling, or very large magnitudes.
 *
 * @param in Vector to scale.
 * @return Scaled values, rounded half-up to the nearest integer.
 */
inline int32x4_t scale255_S32_S32(int32x4_t in)
{
    // Convert to float and apply the 1/255 factor.
    const float32x4_t scaled = vmulq_f32(vcvtq_f32_s32(in), scale255_constant_f32q);
    // Bias by +0.5, then truncate: vcvtq_s32_f32 rounds toward zero, so the
    // bias turns the truncation into round-half-up.
    const float32x4_t biased = vaddq_f32(scaled, positive_round_f32q);
    return vcvtq_s32_f32(biased);
}
124
/** Scale a vector of U16 values by 1/255 with round-half-up.
 *
 * Widens each half of the input to 32 bits, reuses scale255_S32_S32(), then
 * narrows the two halves back and recombines them in the original order.
 */
inline uint16x8_t scale255_U16_U16(uint16x8_t in)
{
    const int32x4_t lo = scale255_S32_S32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(in))));
    const int32x4_t hi = scale255_S32_S32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(in))));
    return vreinterpretq_u16_s16(vcombine_s16(vmovn_s32(lo), vmovn_s32(hi)));
}
131
/** Quantize helper, enabled only for T == int8_t (QASYMM8_SIGNED path).
 *
 * Forwards to the signed quantization primitive so callers can pick the
 * right variant with a single vquantize<T>() spelling.
 */
template <typename T>
inline std::enable_if_t<std::is_same<T, int8_t>::value, int8x16_t>
vquantize(float32x4x4_t val, const UniformQuantizationInfo &info)
{
    return vquantize_signed(val, info);
}
138
/** Quantize helper, enabled only for T == uint8_t (QASYMM8 path).
 *
 * The unqualified call below is NOT recursive: this template cannot deduce T
 * from the arguments, so overload resolution picks the non-template
 * vquantize() overload for float32x4x4_t (declared in NEAsymm.h).
 */
template <typename T>
inline std::enable_if_t<std::is_same<T, uint8_t>::value, uint8x16_t>
vquantize(float32x4x4_t val, const UniformQuantizationInfo &info)
{
    return vquantize(val, info);
}
145
/** Saturating element-wise multiplication for 8-bit quantized tensors
 *  (T = uint8_t for QASYMM8, T = int8_t for QASYMM8_SIGNED).
 *
 * Each element is dequantized to float, multiplied, and requantized with a
 * temporary quantization info whose scale is output_scale / scale, which
 * folds the user-provided scale factor into the requantization step.
 *
 * @param[in]  src1   First input tensor.
 * @param[in]  src2   Second input tensor.
 * @param[out] out    Destination tensor.
 * @param[in]  window Execution window.
 * @param[in]  scale  Scale factor applied to every product.
 */
template <typename T>
void mul_saturate_quantized_8(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, float scale)
{
    // Create input windows; if one input has extent 1 along a dimension it is
    // broadcast against the other input.
    Window win        = window;
    Window input1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
    Window input2_win = window.broadcast_if_dimension_le_one(src2->info()->tensor_shape());

    // Clear X Dimension on execution window as we handle manually
    win.set(Window::DimX, Window::Dimension(0, 1, 1));

    const int  window_step_x         = 16 / sizeof(T); // one 128-bit vector of T per step
    const auto window_start_x        = static_cast<int>(window.x().start());
    const auto window_end_x          = static_cast<int>(window.x().end());
    const bool is_broadcast_across_x = src1->info()->tensor_shape().x() != src2->info()->tensor_shape().x();

    // Fold the user scale into the requantization scale.
    const UniformQuantizationInfo output_qua_info = out->info()->quantization_info().uniform();
    const UniformQuantizationInfo tmp_qua_info    = { output_qua_info.scale / scale, output_qua_info.offset };

    if(is_broadcast_across_x)
    {
        // One input contributes a single value per row (x-step == 0 marks it).
        const bool                    is_broadcast_input_2 = input2_win.x().step() == 0;
        Window                        broadcast_win        = is_broadcast_input_2 ? input2_win : input1_win;
        Window                        non_broadcast_win    = !is_broadcast_input_2 ? input2_win : input1_win;
        const ITensor                *broadcast_tensor     = is_broadcast_input_2 ? src2 : src1;
        const ITensor                *non_broadcast_tensor = !is_broadcast_input_2 ? src2 : src1;
        const UniformQuantizationInfo broadcast_qinfo      = broadcast_tensor->info()->quantization_info().uniform();
        const UniformQuantizationInfo non_broadcast_qinfo  = non_broadcast_tensor->info()->quantization_info().uniform();

        // Clear X Dimension on execution window as we handle manually
        non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));

        Iterator broadcast_input(broadcast_tensor, broadcast_win);
        Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
        Iterator dst(out, win);

        using ExactTagType = typename wrapper::traits::neon_vector<T, window_step_x>::tag_type;

        execute_window_loop(win, [&](const Coordinates &)
        {
            const auto non_broadcast_input_ptr = reinterpret_cast<const T *>(non_broadcast_input.ptr());
            const auto output_ptr              = reinterpret_cast<T *>(dst.ptr());

            // Splat the single broadcast value across a full vector.
            const auto broadcast_value     = *reinterpret_cast<const T *>(broadcast_input.ptr());
            const auto broadcast_value_vec = wrapper::vdup_n(broadcast_value, ExactTagType{});

            // Compute window_step_x elements per iteration
            int x = window_start_x;
            for(; x <= (window_end_x - window_step_x); x += window_step_x)
            {
                const auto non_broadcast_v = wrapper::vloadq(non_broadcast_input_ptr + x);

                // Dequantize inputs
                const float32x4x4_t in1_f32x4x4 = vdequantize(non_broadcast_v, non_broadcast_qinfo);
                const float32x4x4_t in2_f32x4x4 = vdequantize(broadcast_value_vec, broadcast_qinfo);

                const float32x4x4_t out_f32x4x4 =
                {
                    vmulq_f32(in1_f32x4x4.val[0], in2_f32x4x4.val[0]),
                    vmulq_f32(in1_f32x4x4.val[1], in2_f32x4x4.val[1]),
                    vmulq_f32(in1_f32x4x4.val[2], in2_f32x4x4.val[2]),
                    vmulq_f32(in1_f32x4x4.val[3], in2_f32x4x4.val[3]),
                };

                // Quantize dst
                const auto result = vquantize<T>(out_f32x4x4, tmp_qua_info);
                wrapper::vstore(output_ptr + x, result);
            }

            // Compute left-over elements (scalar tail)
            for(; x < window_end_x; ++x)
            {
                // Dequantize inputs
                // NOTE: this local deliberately shadows the src1 parameter.
                const T     src1    = *(non_broadcast_input_ptr + x);
                const float tmp_in1 = Qasymm8QuantizationHelper<T>::dequantize(src1, non_broadcast_qinfo);
                const float tmp_in2 = Qasymm8QuantizationHelper<T>::dequantize(broadcast_value, broadcast_qinfo);
                const float tmp_f   = tmp_in1 * tmp_in2;

                // Quantize dst
                const auto tmp_qua = Qasymm8QuantizationHelper<T>::quantize(tmp_f, tmp_qua_info);
                *(output_ptr + x) = tmp_qua;
            }
        },
        broadcast_input, non_broadcast_input, dst);
    }
    else
    {
        const UniformQuantizationInfo input1_qua_info = src1->info()->quantization_info().uniform();
        const UniformQuantizationInfo input2_qua_info = src2->info()->quantization_info().uniform();

        // Clear X Dimension on execution window as we handle manually
        input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
        input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));

        Iterator input1(src1, input1_win);
        Iterator input2(src2, input2_win);
        Iterator dst(out, win);

        execute_window_loop(win, [&](const Coordinates &)
        {
            const auto input1_ptr = reinterpret_cast<const T *>(input1.ptr());
            const auto input2_ptr = reinterpret_cast<const T *>(input2.ptr());
            const auto output_ptr = reinterpret_cast<T *>(dst.ptr());

            // Compute window_step_x elements per iteration
            int x = window_start_x;
            for(; x <= (window_end_x - window_step_x); x += window_step_x)
            {
                const auto input1_q = wrapper::vloadq(input1_ptr + x);
                const auto input2_q = wrapper::vloadq(input2_ptr + x);

                // Dequantize inputs
                const float32x4x4_t in1_f32x4x4 = vdequantize(input1_q, input1_qua_info);
                const float32x4x4_t in2_f32x4x4 = vdequantize(input2_q, input2_qua_info);

                const float32x4x4_t out_f32x4x4 =
                {
                    vmulq_f32(in1_f32x4x4.val[0], in2_f32x4x4.val[0]),
                    vmulq_f32(in1_f32x4x4.val[1], in2_f32x4x4.val[1]),
                    vmulq_f32(in1_f32x4x4.val[2], in2_f32x4x4.val[2]),
                    vmulq_f32(in1_f32x4x4.val[3], in2_f32x4x4.val[3]),
                };

                // Quantize dst
                const auto result = vquantize<T>(out_f32x4x4, tmp_qua_info);
                wrapper::vstore(output_ptr + x, result);
            }

            // Compute left-over elements (scalar tail)
            for(; x < window_end_x; ++x)
            {
                // Dequantize inputs
                // NOTE: these locals deliberately shadow the src1/src2 parameters.
                const T     src1    = *(input1_ptr + x);
                const T     src2    = *(input2_ptr + x);
                const float tmp_in1 = Qasymm8QuantizationHelper<T>::dequantize(src1, input1_qua_info);
                const float tmp_in2 = Qasymm8QuantizationHelper<T>::dequantize(src2, input2_qua_info);
                const float tmp_f   = tmp_in1 * tmp_in2;

                // Quantize dst
                const auto tmp_qua = Qasymm8QuantizationHelper<T>::quantize(tmp_f, tmp_qua_info);
                *(output_ptr + x) = tmp_qua;
            }
        },
        input1, input2, dst);
    }
}
292
/** Saturating element-wise multiplication of two QSYMM16 tensors into a
 *  QSYMM16 destination.
 *
 * Elements are dequantized to float, multiplied, and requantized with the
 * output scale divided by @p scale (folding the user scale into the
 * requantization). The scalar tail clamps to [SHRT_MIN, SHRT_MAX].
 *
 * @param[in]  src1   First input tensor (QSYMM16).
 * @param[in]  src2   Second input tensor (QSYMM16).
 * @param[out] out    Destination tensor (QSYMM16).
 * @param[in]  window Execution window.
 * @param[in]  scale  Scale factor applied to every product.
 */
void mul_saturate_QSYMM16_QSYMM16_QSYMM16(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, float scale)
{
    const UniformQuantizationInfo input1_qua_info = src1->info()->quantization_info().uniform();
    const UniformQuantizationInfo input2_qua_info = src2->info()->quantization_info().uniform();
    const UniformQuantizationInfo output_qua_info = out->info()->quantization_info().uniform();

    // Create input windows
    Window win        = window;
    Window input1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
    Window input2_win = window.broadcast_if_dimension_le_one(src2->info()->tensor_shape());

    // Clear X Dimension on execution window as we handle manually
    win.set(Window::DimX, Window::Dimension(0, 1, 1));
    input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
    input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));

    Iterator input1(src1, input1_win);
    Iterator input2(src2, input2_win);
    Iterator dst(out, win);

    const int  window_step_x  = 16; // 2 x int16x8 vectors per iteration
    const auto window_start_x = static_cast<int>(window.x().start());
    const auto window_end_x   = static_cast<int>(window.x().end());

    // Fold the user scale into the requantization scale.
    const UniformQuantizationInfo tmp_qua_info = { output_qua_info.scale / scale, output_qua_info.offset };

    execute_window_loop(win, [&](const Coordinates &)
    {
        const auto input1_ptr = reinterpret_cast<const qsymm16_t *>(input1.ptr());
        const auto input2_ptr = reinterpret_cast<const qsymm16_t *>(input2.ptr());
        const auto output_ptr = reinterpret_cast<qsymm16_t *>(dst.ptr());

        // Compute window_step_x elements per iteration
        int x = window_start_x;
        for(; x <= (window_end_x - window_step_x); x += window_step_x)
        {
            const qsymm16x8x2_t input1_q =
            {
                {
                    vld1q_s16(input1_ptr + x),
                    vld1q_s16(input1_ptr + x + 8),
                }
            };
            const qsymm16x8x2_t input2_q =
            {
                {
                    vld1q_s16(input2_ptr + x),
                    vld1q_s16(input2_ptr + x + 8),
                }
            };

            // Dequantize inputs
            const float32x4x4_t in1_f32x4x4 = vdequantize(input1_q, input1_qua_info);
            const float32x4x4_t in2_f32x4x4 = vdequantize(input2_q, input2_qua_info);

            const float32x4x4_t out_f32x4x4 =
            {
                vmulq_f32(in1_f32x4x4.val[0], in2_f32x4x4.val[0]),
                vmulq_f32(in1_f32x4x4.val[1], in2_f32x4x4.val[1]),
                vmulq_f32(in1_f32x4x4.val[2], in2_f32x4x4.val[2]),
                vmulq_f32(in1_f32x4x4.val[3], in2_f32x4x4.val[3]),
            };

            const qsymm16x8x2_t result = vquantize_qsymm16(out_f32x4x4, tmp_qua_info);
            vst1q_s16(output_ptr + x, result.val[0]);
            vst1q_s16(output_ptr + x + 8, result.val[1]);
        }

        // Compute left-over elements (scalar tail)
        for(; x < window_end_x; ++x)
        {
            // Dequantize inputs
            float tmp_in1 = static_cast<float>(*(input1_ptr + x)) * input1_qua_info.scale;
            float tmp_in2 = static_cast<float>(*(input2_ptr + x)) * input2_qua_info.scale;
            float tmp_f   = tmp_in1 * tmp_in2;

            // Quantize dst, lrintf() has same rounding mode as vcombine_s16
            // BUGFIX(review): the original wrote
            //   static_cast<qsymm16_t>(tmp > SHRT_MAX) ? SHRT_MAX : ...
            // which casts the *boolean* instead of the clamped result. The
            // condition's truth value was unaffected, so behavior is the same,
            // but the cast now wraps the saturated value as intended.
            int32_t         tmp     = lrintf(tmp_f / tmp_qua_info.scale);
            const qsymm16_t tmp_qua = static_cast<qsymm16_t>((tmp > SHRT_MAX) ? SHRT_MAX : ((tmp < SHRT_MIN) ? SHRT_MIN : tmp));
            *(output_ptr + x) = tmp_qua;
        }
    },
    input1, input2, dst);
}
377
/** Element-wise multiplication of two QSYMM16 tensors into an S32 destination.
 *
 * Raw 16-bit quantized values are widened to 32 bits and multiplied; no
 * dequantization or requantization is performed and @p scale is ignored
 * (validation only admits this combination with scale == 1).
 *
 * @param[in]  src1   First input tensor (QSYMM16).
 * @param[in]  src2   Second input tensor (QSYMM16).
 * @param[out] out    Destination tensor (S32).
 * @param[in]  window Execution window.
 * @param[in]  scale  Unused.
 */
void mul_QSYMM16_QSYMM16_S32(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, int scale)
{
    ARM_COMPUTE_UNUSED(scale);

    // Create input windows
    Window win        = window;
    Window input1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
    Window input2_win = window.broadcast_if_dimension_le_one(src2->info()->tensor_shape());

    // Clear X Dimension on execution window as we handle manually
    win.set(Window::DimX, Window::Dimension(0, 1, 1));
    input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
    input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));

    Iterator input1(src1, input1_win);
    Iterator input2(src2, input2_win);
    Iterator dst(out, win);

    const int  window_step_x  = 16; // 16 x int16 elements per iteration
    const auto window_start_x = static_cast<int>(window.x().start());
    const auto window_end_x   = static_cast<int>(window.x().end());

    execute_window_loop(win, [&](const Coordinates &)
    {
        const auto input1_ptr = reinterpret_cast<const qsymm16_t *>(input1.ptr());
        const auto input2_ptr = reinterpret_cast<const qsymm16_t *>(input2.ptr());
        const auto output_ptr = reinterpret_cast<int32_t *>(dst.ptr());

        // Compute window_step_x elements per iteration
        int x = window_start_x;
        for(; x <= (window_end_x - window_step_x); x += window_step_x)
        {
            const qsymm16x8x2_t input1_q =
            {
                {
                    vld1q_s16(input1_ptr + x),
                    vld1q_s16(input1_ptr + x + 8),
                }
            };
            const qsymm16x8x2_t input2_q =
            {
                {
                    vld1q_s16(input2_ptr + x),
                    vld1q_s16(input2_ptr + x + 8),
                }
            };

            // Widen both operands to 32 bits so the product cannot overflow
            // (|S16| * |S16| fits in 31 bits).
            const int32x4x4_t in1_s32 =
            {
                {
                    vmovl_s16(vget_low_s16(input1_q.val[0])),
                    vmovl_s16(vget_high_s16(input1_q.val[0])),
                    vmovl_s16(vget_low_s16(input1_q.val[1])),
                    vmovl_s16(vget_high_s16(input1_q.val[1])),
                }
            };
            const int32x4x4_t in2_s32 =
            {
                {
                    vmovl_s16(vget_low_s16(input2_q.val[0])),
                    vmovl_s16(vget_high_s16(input2_q.val[0])),
                    vmovl_s16(vget_low_s16(input2_q.val[1])),
                    vmovl_s16(vget_high_s16(input2_q.val[1])),
                }
            };

            const int32x4x4_t result =
            {
                {
                    vmulq_s32(in1_s32.val[0], in2_s32.val[0]),
                    vmulq_s32(in1_s32.val[1], in2_s32.val[1]),
                    vmulq_s32(in1_s32.val[2], in2_s32.val[2]),
                    vmulq_s32(in1_s32.val[3], in2_s32.val[3]),
                }
            };

            vst1q_s32(output_ptr + x, result.val[0]);
            vst1q_s32(output_ptr + x + 4, result.val[1]);
            vst1q_s32(output_ptr + x + 8, result.val[2]);
            vst1q_s32(output_ptr + x + 12, result.val[3]);
        }

        // Compute left-over elements (scalar tail)
        for(; x < window_end_x; ++x)
        {
            int32_t tmp = static_cast<int32_t>(*(input1_ptr + x)) * static_cast<int32_t>(*(input2_ptr + x));
            *(output_ptr + x) = tmp;
        }
    },
    input1, input2, dst);
}
469
/** Element-wise multiplication of two U8 tensors into a U8 destination.
 *
 * Widens the operands to 16 bits before multiplying. The product is then
 * either scaled by 1/255 with round-half-up (is_scale255) or right-shifted
 * by n (scale == 1/2^n). is_sat selects saturating vs wrapping narrowing
 * back to 8 bits.
 *
 * @tparam is_scale255 True when scale == 1/255 (round-to-nearest path).
 * @tparam is_sat      True to saturate on overflow instead of wrapping.
 *
 * @param[in]  src1   First input tensor (U8).
 * @param[in]  src2   Second input tensor (U8).
 * @param[out] out    Destination tensor (U8).
 * @param[in]  window Execution window.
 * @param[in]  n      Right-shift amount for the 1/2^n path (unused when is_scale255).
 */
template <bool is_scale255, bool is_sat>
void mul_U8_U8_U8(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, int n)
{
    // Create input windows
    Window win        = window;
    Window input1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
    Window input2_win = window.broadcast_if_dimension_le_one(src2->info()->tensor_shape());

    // Clear X Dimension on execution window as we handle manually
    win.set(Window::DimX, Window::Dimension(0, 1, 1));
    input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
    input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));

    Iterator input1(src1, input1_win);
    Iterator input2(src2, input2_win);
    Iterator dst(out, win);

    const int  window_step_x  = 16 / sizeof(uint8_t);
    const auto window_start_x = static_cast<int>(window.x().start());
    const auto window_end_x   = static_cast<int>(window.x().end());

    execute_window_loop(win, [&](const Coordinates &)
    {
        const auto input1_ptr = reinterpret_cast<const uint8_t *>(input1.ptr());
        const auto input2_ptr = reinterpret_cast<const uint8_t *>(input2.ptr());
        const auto output_ptr = reinterpret_cast<uint8_t *>(dst.ptr());

        // Compute window_step_x elements per iteration
        int x = window_start_x;
        for(; x <= (window_end_x - window_step_x); x += window_step_x)
        {
            const uint8x16_t ta1 = wrapper::vloadq(input1_ptr + x);
            const uint8x16_t ta2 = wrapper::vloadq(input2_ptr + x);

            // Widen to 16 bits so the 8-bit product cannot overflow.
            uint16x8_t       tmp1_high = vmovl_u8(vget_high_u8(ta1));
            const uint16x8_t tmp2_high = vmovl_u8(vget_high_u8(ta2));
            uint16x8_t       tmp1_low  = vmovl_u8(vget_low_u8(ta1));
            const uint16x8_t tmp2_low  = vmovl_u8(vget_low_u8(ta2));

            tmp1_high = vmulq_u16(tmp1_high, tmp2_high);
            tmp1_low  = vmulq_u16(tmp1_low, tmp2_low);

            if(is_scale255)
            {
                tmp1_high = scale255_U16_U16(tmp1_high);
                tmp1_low  = scale255_U16_U16(tmp1_low);
            }
            else
            {
                // Negative shift count performs a right shift by n.
                const int16x8_t vn = vdupq_n_s16(-n);

                if(is_sat)
                {
                    tmp1_high = vqshlq_u16(tmp1_high, vn);
                    tmp1_low  = vqshlq_u16(tmp1_low, vn);
                }
                else
                {
                    tmp1_high = vshlq_u16(tmp1_high, vn);
                    tmp1_low  = vshlq_u16(tmp1_low, vn);
                }
            }
            if(is_sat)
            {
                // BUGFIX: store at output_ptr + x. The original stored at
                // output_ptr, overwriting the first 16 elements of the row on
                // every iteration and leaving the rest of the row unwritten.
                // (Every other kernel in this file stores at output_ptr + x.)
                vst1q_u8(output_ptr + x, vcombine_u8(vqmovn_u16(tmp1_low), vqmovn_u16(tmp1_high)));
            }
            else
            {
                vst1q_u8(output_ptr + x, vcombine_u8(vmovn_u16(tmp1_low), vmovn_u16(tmp1_high)));
            }
        }

        // Compute left-over elements (scalar tail)
        for(; x < window_end_x; ++x)
        {
            uint16_t tmp = static_cast<uint16_t>(*(input1_ptr + x)) * static_cast<uint16_t>(*(input2_ptr + x));

            if(is_scale255)
            {
                float tmp_f = static_cast<float>(tmp) * scale255_constant;
                tmp         = static_cast<uint16_t>(tmp_f + 0.5f);
            }
            else
            {
                tmp >>= n;
            }
            if(is_sat && tmp > 255)
            {
                tmp = 255;
            }
            *(output_ptr + x) = static_cast<uint8_t>(tmp);
        }
    },
    input1, input2, dst);
}
565
/** Multiply two S16 vectors (8 elements) with widening, scale the 32-bit
 *  product, and narrow back to S16.
 *
 * @tparam is_scale255 True when scale == 1/255 (round-to-nearest path).
 * @tparam is_sat      True to saturate when shifting/narrowing, false to wrap.
 *
 * @param src1 First operand.
 * @param src2 Second operand.
 * @param n    Right-shift amount for the 1/2^n path (unused when is_scale255).
 * @return Scaled products narrowed to S16.
 */
template <bool is_scale255, bool is_sat>
inline int16x8_t mul_S16_S16_S16_n_loop(const int16x8_t &src1, const int16x8_t &src2, int n)
{
    // Widen both halves to 32 bits so the S16 x S16 product cannot overflow.
    int32x4_t       tmp1_high = vmovl_s16(vget_high_s16(src1));
    const int32x4_t tmp2_high = vmovl_s16(vget_high_s16(src2));
    int32x4_t       tmp1_low  = vmovl_s16(vget_low_s16(src1));
    const int32x4_t tmp2_low  = vmovl_s16(vget_low_s16(src2));

    tmp1_high = vmulq_s32(tmp1_high, tmp2_high);
    tmp1_low  = vmulq_s32(tmp1_low, tmp2_low);

    if(is_scale255)
    {
        tmp1_high = scale255_S32_S32(tmp1_high);
        tmp1_low  = scale255_S32_S32(tmp1_low);
    }
    else
    {
        // An arithmetic right shift rounds toward negative infinity; adding
        // (2^n - 1) to negative values first makes the shift round toward
        // zero, matching C integer division semantics.
        // Right shift amount
        const int32x4_t vn = vdupq_n_s32(-n);
        // Left shift amount
        const int32x4_t vnl = vdupq_n_s32(n);
        // Calculate conversion bit: sign is 1 for negative lanes, 0 otherwise,
        // so convert = sign ? (2^n - 1) : 0.
        const uint32x4_t tmp1_high_u  = vreinterpretq_u32_s32(tmp1_high);
        const uint32x4_t tmp1_low_u   = vreinterpretq_u32_s32(tmp1_low);
        const uint32x4_t sign_high    = vshrq_n_u32(tmp1_high_u, 31);
        const uint32x4_t sign_low     = vshrq_n_u32(tmp1_low_u, 31);
        const int32x4_t  sign_high_s  = vreinterpretq_s32_u32(sign_high);
        const int32x4_t  sign_low_s   = vreinterpretq_s32_u32(sign_low);
        const int32x4_t  convert_high = vsubq_s32(vshlq_s32(sign_high_s, vnl), sign_high_s);
        const int32x4_t  convert_low  = vsubq_s32(vshlq_s32(sign_low_s, vnl), sign_low_s);
        if(is_sat)
        {
            tmp1_high = vqshlq_s32(vaddq_s32(tmp1_high, convert_high), vn);
            tmp1_low  = vqshlq_s32(vaddq_s32(tmp1_low, convert_low), vn);
        }
        else
        {
            tmp1_high = vshlq_s32(vaddq_s32(tmp1_high, convert_high), vn);
            tmp1_low  = vshlq_s32(vaddq_s32(tmp1_low, convert_low), vn);
        }
    }

    // Narrow back to S16, saturating or wrapping as requested.
    if(is_sat)
    {
        return vcombine_s16(vqmovn_s32(tmp1_low), vqmovn_s32(tmp1_high));
    }
    else
    {
        return vcombine_s16(vmovn_s32(tmp1_low), vmovn_s32(tmp1_high));
    }
}
618
619template <bool is_scale255, bool is_sat>
Sheri Zhang1e3ab422021-03-16 17:35:08 +0000620inline int16x8x2_t mul_S16_S16_S16_n_k(const int16x8x2_t &src1, const int16x8x2_t &src2, int n)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100621{
622 const int16x8x2_t result =
623 {
624 {
625 // First 8 elements
Sheri Zhang1e3ab422021-03-16 17:35:08 +0000626 mul_S16_S16_S16_n_loop<is_scale255, is_sat>(src1.val[0], src2.val[0], n),
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100627 // Second 8 elements
Sheri Zhang1e3ab422021-03-16 17:35:08 +0000628 mul_S16_S16_S16_n_loop<is_scale255, is_sat>(src1.val[1], src2.val[1], n)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100629 }
630 };
631
632 return result;
633}
634
635template <bool is_scale255, bool is_sat>
Sheri Zhang1e3ab422021-03-16 17:35:08 +0000636void mul_S16_S16_S16(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, int n)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100637{
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +0100638 // Create input windows
639 Window win = window;
Sheri Zhang1e3ab422021-03-16 17:35:08 +0000640 Window input1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
641 Window input2_win = window.broadcast_if_dimension_le_one(src2->info()->tensor_shape());
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100642
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +0100643 // Clear X Dimension on execution window as we handle manually
644 win.set(Window::DimX, Window::Dimension(0, 1, 1));
645 input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
646 input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100647
Sheri Zhang1e3ab422021-03-16 17:35:08 +0000648 Iterator input1(src1, input1_win);
649 Iterator input2(src2, input2_win);
650 Iterator dst(out, win);
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +0100651
652 const int window_step_x = 16;
653 const auto window_start_x = static_cast<int>(window.x().start());
654 const auto window_end_x = static_cast<int>(window.x().end());
655
656 execute_window_loop(win, [&](const Coordinates &)
657 {
658 const auto input1_ptr = reinterpret_cast<const int16_t *>(input1.ptr());
659 const auto input2_ptr = reinterpret_cast<const int16_t *>(input2.ptr());
Sheri Zhang1e3ab422021-03-16 17:35:08 +0000660 const auto output_ptr = reinterpret_cast<int16_t *>(dst.ptr());
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +0100661
662 // Compute window_step_x elements per iteration
663 int x = window_start_x;
664 for(; x <= (window_end_x - window_step_x); x += window_step_x)
665 {
666 const int16x8x2_t ta1 =
667 {
668 {
669 vld1q_s16(input1_ptr + x),
670 vld1q_s16(input1_ptr + x + 8),
671 }
672 };
673 const int16x8x2_t ta2 =
674 {
675 {
676 vld1q_s16(input2_ptr + x),
677 vld1q_s16(input2_ptr + x + 8),
678 }
679 };
680 const int16x8x2_t result = mul_S16_S16_S16_n_k<is_scale255, is_sat>(ta1, ta2, n);
681
682 vst1q_s16(output_ptr + x, result.val[0]);
683 vst1q_s16(output_ptr + x + 8, result.val[1]);
684 }
685
686 // Compute left-over elements
687 for(; x < window_end_x; ++x)
688 {
689 int32_t tmp = static_cast<int32_t>(*(input1_ptr + x)) * static_cast<int32_t>(*(input2_ptr + x));
690
691 if(is_scale255)
692 {
693 float tmp_f = static_cast<float>(tmp) * scale255_constant;
694
695 tmp = static_cast<int32_t>(tmp_f + 0.5f);
696 }
697 else
698 {
699 if(tmp >= 0)
700 {
701 tmp >>= n;
702 }
703 else
704 {
705 uint32_t mask = (1u << n) - 1;
706 tmp = (tmp + static_cast<int32_t>(mask)) >> n;
707 }
708 }
709 if(is_sat)
710 {
711 tmp = (tmp > SHRT_MAX) ? SHRT_MAX : ((tmp < SHRT_MIN) ? SHRT_MIN : tmp);
712 }
713 *(output_ptr + x) = static_cast<int16_t>(tmp);
714 }
715 },
Sheri Zhang1e3ab422021-03-16 17:35:08 +0000716 input1, input2, dst);
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100717}
718
SiCong Libb88f892020-08-28 11:18:47 +0100719template <bool is_sat>
Sheri Zhang1e3ab422021-03-16 17:35:08 +0000720inline int32x4_t mul_S32_S32_S32_n_loop(const int32x4_t &src1, const int32x4_t &src2, int n)
SiCong Libb88f892020-08-28 11:18:47 +0100721{
Sheri Zhang1e3ab422021-03-16 17:35:08 +0000722 const int32x2_t input1_1 = vget_low_s32(src1);
723 const int32x2_t input2_1 = vget_low_s32(src2);
724 const int32x2_t input1_2 = vget_high_s32(src1);
725 const int32x2_t input2_2 = vget_high_s32(src2);
SiCong Libb88f892020-08-28 11:18:47 +0100726
727 int64x2_t tmp_1 = vmull_s32(input1_1, input2_1);
728 int64x2_t tmp_2 = vmull_s32(input1_2, input2_2);
729
730 // Apply scaling, conversion and rounding (round to zero)
731 // Right shift amount
732 const int64x2_t vn = vdupq_n_s64(-n);
733 // Left shift amount
734 const int64x2_t vnl = vdupq_n_s64(n);
735 // Calculate conversion bit
736 const uint64x2_t tmp_1_u = vreinterpretq_u64_s64(tmp_1);
737 const uint64x2_t sign_1 = vshrq_n_u64(tmp_1_u, 63);
738 const int64x2_t sign_1_s = vreinterpretq_s64_u64(sign_1);
739 const int64x2_t convert_1 = vsubq_s64(vshlq_s64(sign_1_s, vnl), sign_1_s);
740
741 const uint64x2_t tmp_2_u = vreinterpretq_u64_s64(tmp_2);
742 const uint64x2_t sign_2 = vshrq_n_u64(tmp_2_u, 63);
743 const int64x2_t sign_2_s = vreinterpretq_s64_u64(sign_2);
744 const int64x2_t convert_2 = vsubq_s64(vshlq_s64(sign_2_s, vnl), sign_2_s);
745 if(is_sat)
746 {
747 tmp_1 = vqshlq_s64(vaddq_s64(tmp_1, convert_1), vn);
748 tmp_2 = vqshlq_s64(vaddq_s64(tmp_2, convert_2), vn);
749 return vcombine_s32(vqmovn_s64(tmp_1), vqmovn_s64(tmp_2));
750 }
751 else
752 {
753 tmp_1 = vshlq_s64(vaddq_s64(tmp_1, convert_1), vn);
754 tmp_2 = vshlq_s64(vaddq_s64(tmp_2, convert_2), vn);
755 return vcombine_s32(vmovn_s64(tmp_1), vmovn_s64(tmp_2));
756 }
757}
758
759template <bool is_sat>
Sheri Zhang1e3ab422021-03-16 17:35:08 +0000760inline int32x4x2_t mul_S32_S32_S32_n_k(const int32x4x2_t &src1, const int32x4x2_t &src2, int n)
SiCong Libb88f892020-08-28 11:18:47 +0100761{
762 const int32x4x2_t result =
763 {
764 {
765 // First 4 elements
Sheri Zhang1e3ab422021-03-16 17:35:08 +0000766 mul_S32_S32_S32_n_loop<is_sat>(src1.val[0], src2.val[0], n),
SiCong Libb88f892020-08-28 11:18:47 +0100767 // Second 4 elements
Sheri Zhang1e3ab422021-03-16 17:35:08 +0000768 mul_S32_S32_S32_n_loop<is_sat>(src1.val[1], src2.val[1], n)
SiCong Libb88f892020-08-28 11:18:47 +0100769 }
770 };
771
772 return result;
773}
774
775template <bool is_sat>
Sheri Zhang1e3ab422021-03-16 17:35:08 +0000776void mul_S32_S32_S32(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, int n)
SiCong Libb88f892020-08-28 11:18:47 +0100777{
778 // Create input windows
Sheri Zhang1e3ab422021-03-16 17:35:08 +0000779 Window input1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
780 Window input2_win = window.broadcast_if_dimension_le_one(src2->info()->tensor_shape());
SiCong Libb88f892020-08-28 11:18:47 +0100781
782 // Clear X Dimension on execution window as we handle manually
SiCong Lid6d1b362020-09-24 17:34:23 +0100783 Window win = window;
SiCong Libb88f892020-08-28 11:18:47 +0100784 win.set(Window::DimX, Window::Dimension(0, 1, 1));
SiCong Libb88f892020-08-28 11:18:47 +0100785
SiCong Lid6d1b362020-09-24 17:34:23 +0100786 const int window_step_x = 8;
787 const auto window_start_x = static_cast<int>(window.x().start());
788 const auto window_end_x = static_cast<int>(window.x().end());
Sheri Zhang1e3ab422021-03-16 17:35:08 +0000789 const bool is_broadcast_across_x = src1->info()->tensor_shape().x() != src2->info()->tensor_shape().x();
SiCong Libb88f892020-08-28 11:18:47 +0100790
SiCong Lid6d1b362020-09-24 17:34:23 +0100791 if(is_broadcast_across_x)
SiCong Libb88f892020-08-28 11:18:47 +0100792 {
SiCong Lid6d1b362020-09-24 17:34:23 +0100793 const bool is_broadcast_input_2 = input2_win.x().step() == 0;
794 Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win;
795 Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win;
Sheri Zhang1e3ab422021-03-16 17:35:08 +0000796 const ITensor *broadcast_tensor = is_broadcast_input_2 ? src2 : src1;
797 const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? src2 : src1;
SiCong Libb88f892020-08-28 11:18:47 +0100798
SiCong Lid6d1b362020-09-24 17:34:23 +0100799 // Clear X Dimension on execution window as we handle manually
800 non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));
801
802 Iterator broadcast_input(broadcast_tensor, broadcast_win);
803 Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
Sheri Zhang1e3ab422021-03-16 17:35:08 +0000804 Iterator dst(out, win);
SiCong Lid6d1b362020-09-24 17:34:23 +0100805
806 execute_window_loop(win, [&](const Coordinates &)
SiCong Libb88f892020-08-28 11:18:47 +0100807 {
SiCong Lid6d1b362020-09-24 17:34:23 +0100808 const auto non_broadcast_input_ptr = reinterpret_cast<const int32_t *>(non_broadcast_input.ptr());
Sheri Zhang1e3ab422021-03-16 17:35:08 +0000809 const auto output_ptr = reinterpret_cast<int32_t *>(dst.ptr());
SiCong Libb88f892020-08-28 11:18:47 +0100810
SiCong Lid6d1b362020-09-24 17:34:23 +0100811 const int32_t broadcast_value = *reinterpret_cast<const int32_t *>(broadcast_input.ptr());
812 const auto broadcast_value_vec = vdupq_n_s32(broadcast_value);
SiCong Libb88f892020-08-28 11:18:47 +0100813
SiCong Lid6d1b362020-09-24 17:34:23 +0100814 // Compute window_step_x elements per iteration
815 int x = window_start_x;
816 for(; x <= (window_end_x - window_step_x); x += window_step_x)
817 {
818 const int32x4x2_t broadcast_v =
819 {
820 {
821 broadcast_value_vec,
822 broadcast_value_vec,
823 }
824 };
825 const int32x4x2_t non_broadcast_v =
826 {
827 {
828 vld1q_s32(non_broadcast_input_ptr + x),
829 vld1q_s32(non_broadcast_input_ptr + x + 4),
830 }
831 };
832 const int32x4x2_t result = mul_S32_S32_S32_n_k<is_sat>(broadcast_v, non_broadcast_v, n);
833
834 vst1q_s32(output_ptr + x, result.val[0]);
835 vst1q_s32(output_ptr + x + 4, result.val[1]);
836 }
837
838 // Compute left-over elements
839 for(; x < window_end_x; ++x)
840 {
841 int64_t tmp = static_cast<int64_t>(broadcast_value) * static_cast<int64_t>(*(non_broadcast_input_ptr + x));
842
843 if(tmp >= 0)
844 {
845 tmp >>= n;
846 }
847 else
848 {
849 uint64_t mask = (1u << n) - 1;
850 tmp = (tmp + static_cast<int64_t>(mask)) >> n;
851 }
852 if(is_sat)
853 {
854 tmp = utility::clamp<int64_t, int32_t>(tmp);
855 }
856 *(output_ptr + x) = static_cast<int32_t>(tmp);
857 }
858 },
Sheri Zhang1e3ab422021-03-16 17:35:08 +0000859 broadcast_input, non_broadcast_input, dst);
SiCong Lid6d1b362020-09-24 17:34:23 +0100860 }
861 else
862 {
863 // Clear X Dimension on execution window as we handle manually
864 input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
865 input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
866
Sheri Zhang1e3ab422021-03-16 17:35:08 +0000867 Iterator input1(src1, input1_win);
868 Iterator input2(src2, input2_win);
869 Iterator dst(out, win);
SiCong Lid6d1b362020-09-24 17:34:23 +0100870
871 execute_window_loop(win, [&](const Coordinates &)
SiCong Libb88f892020-08-28 11:18:47 +0100872 {
SiCong Lid6d1b362020-09-24 17:34:23 +0100873 const auto input1_ptr = reinterpret_cast<const int32_t *>(input1.ptr());
874 const auto input2_ptr = reinterpret_cast<const int32_t *>(input2.ptr());
Sheri Zhang1e3ab422021-03-16 17:35:08 +0000875 const auto output_ptr = reinterpret_cast<int32_t *>(dst.ptr());
SiCong Libb88f892020-08-28 11:18:47 +0100876
SiCong Lid6d1b362020-09-24 17:34:23 +0100877 // Compute window_step_x elements per iteration
878 int x = window_start_x;
879 for(; x <= (window_end_x - window_step_x); x += window_step_x)
SiCong Libb88f892020-08-28 11:18:47 +0100880 {
SiCong Lid6d1b362020-09-24 17:34:23 +0100881 const int32x4x2_t ta1 =
882 {
883 {
884 vld1q_s32(input1_ptr + x),
885 vld1q_s32(input1_ptr + x + 4),
886 }
887 };
888 const int32x4x2_t ta2 =
889 {
890 {
891 vld1q_s32(input2_ptr + x),
892 vld1q_s32(input2_ptr + x + 4),
893 }
894 };
895 const int32x4x2_t result = mul_S32_S32_S32_n_k<is_sat>(ta1, ta2, n);
896
897 vst1q_s32(output_ptr + x, result.val[0]);
898 vst1q_s32(output_ptr + x + 4, result.val[1]);
SiCong Libb88f892020-08-28 11:18:47 +0100899 }
SiCong Lid6d1b362020-09-24 17:34:23 +0100900
901 // Compute left-over elements
902 for(; x < window_end_x; ++x)
SiCong Libb88f892020-08-28 11:18:47 +0100903 {
SiCong Lid6d1b362020-09-24 17:34:23 +0100904 int64_t tmp = static_cast<int64_t>(*(input1_ptr + x)) * static_cast<int64_t>(*(input2_ptr + x));
905
906 if(tmp >= 0)
907 {
908 tmp >>= n;
909 }
910 else
911 {
912 uint64_t mask = (1u << n) - 1;
913 tmp = (tmp + static_cast<int64_t>(mask)) >> n;
914 }
915 if(is_sat)
916 {
917 tmp = utility::clamp<int64_t, int32_t>(tmp);
918 }
919 *(output_ptr + x) = static_cast<int32_t>(tmp);
SiCong Libb88f892020-08-28 11:18:47 +0100920 }
SiCong Lid6d1b362020-09-24 17:34:23 +0100921 },
Sheri Zhang1e3ab422021-03-16 17:35:08 +0000922 input1, input2, dst);
SiCong Lid6d1b362020-09-24 17:34:23 +0100923 }
SiCong Libb88f892020-08-28 11:18:47 +0100924}
925
Sheri Zhang1e3ab422021-03-16 17:35:08 +0000926void mul_F32_F32_F32(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, float scale)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100927{
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +0100928 // Create input windows
Sheri Zhang1e3ab422021-03-16 17:35:08 +0000929 Window input1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
930 Window input2_win = window.broadcast_if_dimension_le_one(src2->info()->tensor_shape());
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100931
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +0100932 // Clear X Dimension on execution window as we handle manually
933 Window win = window;
934 win.set(Window::DimX, Window::Dimension(0, 1, 1));
935
936 constexpr int window_step_x = 16 / sizeof(float);
937 const auto window_start_x = static_cast<int>(window.x().start());
938 const auto window_end_x = static_cast<int>(window.x().end());
Sheri Zhang1e3ab422021-03-16 17:35:08 +0000939 const bool is_broadcast_across_x = src1->info()->tensor_shape().x() != src2->info()->tensor_shape().x();
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +0100940
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +0100941 using ExactTagType = typename wrapper::traits::neon_vector<float, window_step_x>::tag_type;
942
943 if(is_broadcast_across_x)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100944 {
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +0100945 const bool is_broadcast_input_2 = input2_win.x().step() == 0;
946 Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win;
947 Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win;
Sheri Zhang1e3ab422021-03-16 17:35:08 +0000948 const ITensor *broadcast_tensor = is_broadcast_input_2 ? src2 : src1;
949 const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? src2 : src1;
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +0100950
951 // Clear X Dimension on execution window as we handle manually
952 non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));
953
954 Iterator broadcast_input(broadcast_tensor, broadcast_win);
955 Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
Sheri Zhang1e3ab422021-03-16 17:35:08 +0000956 Iterator dst(out, win);
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +0100957
958 execute_window_loop(win, [&](const Coordinates &)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100959 {
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +0100960 const auto non_broadcast_input_ptr = reinterpret_cast<const float *>(non_broadcast_input.ptr());
Sheri Zhang1e3ab422021-03-16 17:35:08 +0000961 const auto output_ptr = reinterpret_cast<float *>(dst.ptr());
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +0100962
963 const float broadcast_value = *reinterpret_cast<const float *>(broadcast_input.ptr());
964 const auto broadcast_value_vec = wrapper::vdup_n(broadcast_value, ExactTagType{});
965 const auto scale_vec = wrapper::vdup_n(scale, ExactTagType{});
966
967 // Compute window_step_x elements per iteration
968 int x = window_start_x;
969 for(; x <= (window_end_x - window_step_x); x += window_step_x)
970 {
971 const auto non_broadcast_v = wrapper::vloadq(non_broadcast_input_ptr + x);
972 auto res = wrapper::vmul(wrapper::vmul(broadcast_value_vec, non_broadcast_v), scale_vec);
973 wrapper::vstore(output_ptr + x, res);
974 }
975
976 // Compute left-over elements
977 for(; x < window_end_x; ++x)
978 {
979 const auto non_broadcast_v = *(non_broadcast_input_ptr + x);
980 *(output_ptr + x) = broadcast_value * non_broadcast_v * scale;
981 }
982 },
Sheri Zhang1e3ab422021-03-16 17:35:08 +0000983 broadcast_input, non_broadcast_input, dst);
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +0100984 }
985 else
986 {
987 // Clear X Dimension on execution window as we handle manually
988 input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
989 input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
990
Sheri Zhang1e3ab422021-03-16 17:35:08 +0000991 Iterator input1(src1, input1_win);
992 Iterator input2(src2, input2_win);
993 Iterator dst(out, win);
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +0100994
995 execute_window_loop(win, [&](const Coordinates &)
996 {
997 const auto input1_ptr = reinterpret_cast<const float *>(input1.ptr());
998 const auto input2_ptr = reinterpret_cast<const float *>(input2.ptr());
Sheri Zhang1e3ab422021-03-16 17:35:08 +0000999 const auto output_ptr = reinterpret_cast<float *>(dst.ptr());
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +01001000
1001 // Compute window_step_x elements per iteration
1002 int x = window_start_x;
1003 for(; x <= (window_end_x - window_step_x); x += window_step_x)
1004 {
1005 const auto ta1 = wrapper::vloadq(input1_ptr + x);
1006 const auto ta2 = wrapper::vloadq(input2_ptr + x);
1007 const auto scale_vec = wrapper::vdup_n(scale, ExactTagType{});
1008 const auto res = wrapper::vmul(wrapper::vmul(ta1, ta2), scale_vec);
1009 wrapper::vstore(output_ptr + x, res);
1010 }
1011
1012 // Compute left-over elements
1013 for(; x < window_end_x; ++x)
1014 {
1015 const auto ta1 = *(input1_ptr + x);
1016 const auto ta2 = *(input2_ptr + x);
1017 *(output_ptr + x) = ta1 * ta2 * scale;
1018 }
1019 },
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001020 input1, input2, dst);
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +01001021 }
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001022}
1023
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001024void c_mul_F32_F32_F32_n(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window)
giuros01154bc1c2019-03-26 17:44:40 +00001025{
Sheri Zhang4d91dc62020-09-23 11:22:50 +01001026 // Create input windows
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001027 Window input1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
1028 Window input2_win = window.broadcast_if_dimension_le_one(src2->info()->tensor_shape());
giuros01154bc1c2019-03-26 17:44:40 +00001029
Sheri Zhang4d91dc62020-09-23 11:22:50 +01001030 // Clear X Dimension on execution window as we handle manually
1031 Window win = window;
1032 win.set(Window::DimX, Window::Dimension(0, 1, 1));
giuros01154bc1c2019-03-26 17:44:40 +00001033
Sheri Zhang4d91dc62020-09-23 11:22:50 +01001034 constexpr int window_step_x = 8 / sizeof(float);
1035 const auto window_start_x = static_cast<int>(window.x().start());
1036 const auto window_end_x = static_cast<int>(window.x().end());
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001037 const bool is_broadcast_across_x = src1->info()->tensor_shape().x() != src2->info()->tensor_shape().x();
giuros01154bc1c2019-03-26 17:44:40 +00001038
Sheri Zhang2cb05d92020-11-09 15:12:32 +00001039 using ExactTagType = typename wrapper::traits::neon_vector<float, 2>::tag_type;
1040
Sheri Zhang4d91dc62020-09-23 11:22:50 +01001041 if(is_broadcast_across_x)
1042 {
1043 const bool is_broadcast_input_2 = input2_win.x().step() == 0;
1044 Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win;
1045 Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win;
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001046 const ITensor *broadcast_tensor = is_broadcast_input_2 ? src2 : src1;
1047 const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? src2 : src1;
giuros01154bc1c2019-03-26 17:44:40 +00001048
Sheri Zhang4d91dc62020-09-23 11:22:50 +01001049 // Clear X Dimension on execution window as we handle manually
1050 non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));
giuros01154bc1c2019-03-26 17:44:40 +00001051
Sheri Zhang4d91dc62020-09-23 11:22:50 +01001052 Iterator broadcast_input(broadcast_tensor, broadcast_win);
1053 Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001054 Iterator dst(out, win);
giuros01154bc1c2019-03-26 17:44:40 +00001055
Sheri Zhang4d91dc62020-09-23 11:22:50 +01001056 execute_window_loop(win, [&](const Coordinates &)
1057 {
1058 const auto non_broadcast_input_ptr = reinterpret_cast<const float *>(non_broadcast_input.ptr());
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001059 const auto output_ptr = reinterpret_cast<float *>(dst.ptr());
giuros01154bc1c2019-03-26 17:44:40 +00001060
Sheri Zhang4d91dc62020-09-23 11:22:50 +01001061 const float broadcast_value = *reinterpret_cast<const float *>(broadcast_input.ptr());
1062
Sheri Zhang2cb05d92020-11-09 15:12:32 +00001063 // Compute window_step_x elements per iteration
Sheri Zhang4d91dc62020-09-23 11:22:50 +01001064 int x = window_start_x;
Sheri Zhang2cb05d92020-11-09 15:12:32 +00001065 for(; x <= (window_end_x - window_step_x); x += window_step_x)
1066 {
1067 const auto a = wrapper::vloadq(non_broadcast_input_ptr + 2 * x);
1068 float32x4_t b = vdupq_n_f32(broadcast_value);
1069
1070 const float32x4_t mask = { -1.0f, 1.0f, -1.0f, 1.0f };
1071 const float32x2_t tmp00 = wrapper::vdup_n(wrapper::vgetlane(a, 0), ExactTagType{});
1072 const float32x2_t tmp01 = wrapper::vdup_n(wrapper::vgetlane(a, 1), ExactTagType{});
1073 const float32x2_t tmp10 = wrapper::vdup_n(wrapper::vgetlane(a, 2), ExactTagType{});
1074 const float32x2_t tmp11 = wrapper::vdup_n(wrapper::vgetlane(a, 3), ExactTagType{});
1075
1076 const float32x4_t tmp0 = wrapper::vcombine(tmp00, tmp10);
1077 const float32x4_t tmp1 = wrapper::vcombine(tmp01, tmp11);
1078
1079 float32x4_t res = wrapper::vmul(tmp0, b);
1080 b = wrapper::vmul(b, mask);
1081
1082 res = wrapper::vmla(res, tmp1, b);
1083 wrapper::vstore(output_ptr + 2 * x, res);
1084 }
1085
Sheri Zhang4d91dc62020-09-23 11:22:50 +01001086 // Compute left-over elements
1087 for(; x < window_end_x; ++x)
1088 {
Sheri Zhang2cb05d92020-11-09 15:12:32 +00001089 const auto non_broadcast_value0 = *(non_broadcast_input_ptr + 2 * x);
1090 const auto non_broadcast_value1 = *(non_broadcast_input_ptr + 2 * x + 1);
1091 auto res1 = broadcast_value * (non_broadcast_value0 - non_broadcast_value1);
1092 auto res2 = broadcast_value * (non_broadcast_value1 + non_broadcast_value0);
1093 *(output_ptr + 2 * x) = res1;
1094 *(output_ptr + 2 * x + 1) = res2;
Sheri Zhang4d91dc62020-09-23 11:22:50 +01001095 }
1096 },
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001097 broadcast_input, non_broadcast_input, dst);
Sheri Zhang4d91dc62020-09-23 11:22:50 +01001098 }
1099 else
1100 {
1101 // Clear X Dimension on execution window as we handle manually
1102 input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
1103 input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
1104
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001105 Iterator input1(src1, input1_win);
1106 Iterator input2(src2, input2_win);
1107 Iterator dst(out, win);
Sheri Zhang4d91dc62020-09-23 11:22:50 +01001108
1109 execute_window_loop(win, [&](const Coordinates &)
1110 {
1111 const auto input1_ptr = reinterpret_cast<const float *>(input1.ptr());
1112 const auto input2_ptr = reinterpret_cast<const float *>(input2.ptr());
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001113 const auto output_ptr = reinterpret_cast<float *>(dst.ptr());
Sheri Zhang4d91dc62020-09-23 11:22:50 +01001114
Sheri Zhang4d91dc62020-09-23 11:22:50 +01001115 // Compute window_step_x elements per iteration
1116 int x = window_start_x;
1117 for(; x <= (window_end_x - window_step_x); x += window_step_x)
1118 {
1119 const float32x4_t a = wrapper::vloadq(input1_ptr + 2 * x);
1120 float32x4_t b = wrapper::vloadq(input2_ptr + 2 * x);
1121
1122 const float32x4_t mask = { -1.0f, 1.0f, -1.0f, 1.0f };
1123 const float32x2_t tmp00 = wrapper::vdup_n(wrapper::vgetlane(a, 0), ExactTagType{});
1124 const float32x2_t tmp01 = wrapper::vdup_n(wrapper::vgetlane(a, 1), ExactTagType{});
1125 const float32x2_t tmp10 = wrapper::vdup_n(wrapper::vgetlane(a, 2), ExactTagType{});
1126 const float32x2_t tmp11 = wrapper::vdup_n(wrapper::vgetlane(a, 3), ExactTagType{});
1127
1128 const float32x4_t tmp0 = wrapper::vcombine(tmp00, tmp10);
1129 const float32x4_t tmp1 = wrapper::vcombine(tmp01, tmp11);
1130
1131 float32x4_t res = wrapper::vmul(tmp0, b);
1132
1133 b = wrapper::vrev64(b);
1134 b = wrapper::vmul(b, mask);
1135
1136 res = wrapper::vmla(res, tmp1, b);
1137 wrapper::vstore(output_ptr + 2 * x, res);
1138 }
1139
1140 // Compute left-over elements
1141 for(; x < window_end_x; ++x)
1142 {
1143 const auto a0 = *(input1_ptr + 2 * x);
1144 const auto a1 = *(input1_ptr + 2 * x + 1);
1145 const auto b0 = *(input2_ptr + 2 * x);
1146 const auto b1 = *(input2_ptr + 2 * x + 1);
1147 auto res1 = a0 * b0 - a1 * b1;
1148 auto res2 = a0 * b1 + a1 * b0;
1149 *(output_ptr + 2 * x) = res1;
1150 *(output_ptr + 2 * x + 1) = res2;
1151 }
1152 },
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001153 input1, input2, dst);
Sheri Zhang4d91dc62020-09-23 11:22:50 +01001154 }
giuros01154bc1c2019-03-26 17:44:40 +00001155}
1156
Ioan-Cristian Szabo5edbd1c2017-11-13 13:34:08 +00001157#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001158void mul_F16_F16_F16(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, float scale)
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +01001159{
1160 // Create input windows
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001161 Window input1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
1162 Window input2_win = window.broadcast_if_dimension_le_one(src2->info()->tensor_shape());
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +01001163
1164 // Clear X Dimension on execution window as we handle manually
Pablo Marquez Telloc727d522021-01-27 14:16:13 +00001165 Window win = window;
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +01001166 win.set(Window::DimX, Window::Dimension(0, 1, 1));
Pablo Marquez Telloc727d522021-01-27 14:16:13 +00001167 constexpr int window_step_x = 16;
1168 const auto window_start_x = static_cast<int>(window.x().start());
1169 const auto window_end_x = static_cast<int>(window.x().end());
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001170 const bool is_broadcast_across_x = src1->info()->tensor_shape().x() != src2->info()->tensor_shape().x();
Pablo Marquez Telloc727d522021-01-27 14:16:13 +00001171 if(is_broadcast_across_x)
Michele Di Giorgio9428a182020-03-30 14:10:20 +01001172 {
Pablo Marquez Telloc727d522021-01-27 14:16:13 +00001173 const bool is_broadcast_input_2 = input2_win.x().step() == 0;
1174 Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win;
1175 Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win;
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001176 const ITensor *broadcast_tensor = is_broadcast_input_2 ? src2 : src1;
1177 const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? src2 : src1;
Pablo Marquez Telloc727d522021-01-27 14:16:13 +00001178 // Clear X Dimension on execution window as we handle manually
1179 non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));
1180 Iterator broadcast_input(broadcast_tensor, broadcast_win);
1181 Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001182 Iterator dst(out, win);
Pablo Marquez Telloc727d522021-01-27 14:16:13 +00001183 execute_window_loop(win, [&](const Coordinates &)
Michele Di Giorgio9428a182020-03-30 14:10:20 +01001184 {
Pablo Marquez Telloc727d522021-01-27 14:16:13 +00001185 const auto non_broadcast_input_ptr = reinterpret_cast<const float16_t *>(non_broadcast_input.ptr());
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001186 const auto output_ptr = reinterpret_cast<float16_t *>(dst.ptr());
Pablo Marquez Telloc727d522021-01-27 14:16:13 +00001187 const auto broadcast_value = *reinterpret_cast<const float16_t *>(broadcast_input.ptr());
1188 const float16x8x2_t broadcast_value_vec =
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +01001189 {
1190 {
Pablo Marquez Telloc727d522021-01-27 14:16:13 +00001191 vdupq_n_f16(broadcast_value),
1192 vdupq_n_f16(broadcast_value),
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +01001193 }
1194 };
Pablo Marquez Telloc727d522021-01-27 14:16:13 +00001195 const auto scale_vec = vdupq_n_f16(scale);
1196 // Compute window_step_x elements per iteration
1197 int x = window_start_x;
1198 for(; x <= (window_end_x - window_step_x); x += window_step_x)
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +01001199 {
Pablo Marquez Telloc727d522021-01-27 14:16:13 +00001200 const float16x8x2_t non_broadcast_v =
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +01001201 {
Pablo Marquez Telloc727d522021-01-27 14:16:13 +00001202 {
1203 vld1q_f16(non_broadcast_input_ptr + x),
1204 vld1q_f16(non_broadcast_input_ptr + x + 8),
1205 }
1206 };
1207 const float16x8x2_t result =
1208 {
1209 {
1210 vmulq_f16(vmulq_f16(broadcast_value_vec.val[0], non_broadcast_v.val[0]), scale_vec),
1211 vmulq_f16(vmulq_f16(broadcast_value_vec.val[1], non_broadcast_v.val[1]), scale_vec),
1212 }
1213 };
1214 vst1q_f16(output_ptr + x, result.val[0]);
1215 vst1q_f16(output_ptr + x + 8, result.val[1]);
1216 }
1217 // Compute left-over elements
1218 for(; x < window_end_x; ++x)
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +01001219 {
Pablo Marquez Telloc727d522021-01-27 14:16:13 +00001220 const auto non_broadcast_v = *(non_broadcast_input_ptr + x);
1221 *(output_ptr + x) = broadcast_value * non_broadcast_v * scale;
1222 }
1223 },
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001224 broadcast_input, non_broadcast_input, dst);
Pablo Marquez Telloc727d522021-01-27 14:16:13 +00001225 }
1226 else
1227 {
1228 input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
1229 input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001230 Iterator input1(src1, input1_win);
1231 Iterator input2(src2, input2_win);
1232 Iterator dst(out, win);
Pablo Marquez Telloc727d522021-01-27 14:16:13 +00001233 execute_window_loop(win, [&](const Coordinates &)
Michele Di Giorgio9428a182020-03-30 14:10:20 +01001234 {
Pablo Marquez Telloc727d522021-01-27 14:16:13 +00001235 const auto input1_ptr = reinterpret_cast<const float16_t *>(input1.ptr());
1236 const auto input2_ptr = reinterpret_cast<const float16_t *>(input2.ptr());
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001237 const auto output_ptr = reinterpret_cast<float16_t *>(dst.ptr());
Pablo Marquez Telloc727d522021-01-27 14:16:13 +00001238 // Compute window_step_x elements per iteration
1239 int x = window_start_x;
1240 for(; x <= (window_end_x - window_step_x); x += window_step_x)
1241 {
1242 const float16x8x2_t ta1 =
1243 {
1244 {
1245 vld1q_f16(input1_ptr + x),
1246 vld1q_f16(input1_ptr + x + 8),
1247 }
1248 };
1249 const float16x8x2_t ta2 =
1250 {
1251 {
1252 vld1q_f16(input2_ptr + x),
1253 vld1q_f16(input2_ptr + x + 8),
1254 }
1255 };
1256 const float16x8_t scale_vec = vdupq_n_f16(scale);
1257 const float16x8x2_t result =
1258 {
1259 {
1260 vmulq_f16(vmulq_f16(ta1.val[0], ta2.val[0]), scale_vec),
1261 vmulq_f16(vmulq_f16(ta1.val[1], ta2.val[1]), scale_vec),
1262 }
1263 };
1264 vst1q_f16(output_ptr + x, result.val[0]);
1265 vst1q_f16(output_ptr + x + 8, result.val[1]);
1266 }
1267 // Compute left-over elements
1268 for(; x < window_end_x; ++x)
1269 {
1270 const auto ta1 = *(input1_ptr + x);
1271 const auto ta2 = *(input2_ptr + x);
1272 *(output_ptr + x) = ta1 * ta2 * scale;
1273 }
1274 },
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001275 input1, input2, dst);
Pablo Marquez Telloc727d522021-01-27 14:16:13 +00001276 }
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +01001277}
Ioan-Cristian Szabo5edbd1c2017-11-13 13:34:08 +00001278#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
Pablo Tellodf246182017-07-03 16:25:09 +01001279
/** U8 x U8 -> S16 element-wise multiplication worker.
 *
 * Computes out[i] = src1[i] * src2[i], then scales the product either by
 * 1/255 (when @p is_scale255 is true) or by 2^-n (right shift, when false).
 * When @p is_sat is true the result is clamped to SHRT_MAX; no lower clamp is
 * needed because both inputs are unsigned.
 *
 * @param src1   First input tensor (U8).
 * @param src2   Second input tensor (U8).
 * @param out    Output tensor (S16).
 * @param window Region on which to execute.
 * @param n      Number of bits to shift right when not scaling by 1/255.
 */
template <bool is_scale255, bool is_sat>
void mul_U8_U8_S16(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, int n)
{
    // Create input windows; broadcast inputs whose dimension is 1 so a single
    // element is reused across the corresponding output dimension.
    Window win        = window;
    Window input1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
    Window input2_win = window.broadcast_if_dimension_le_one(src2->info()->tensor_shape());

    // Clear X Dimension on execution window as we handle manually
    win.set(Window::DimX, Window::Dimension(0, 1, 1));
    input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
    input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));

    Iterator input1(src1, input1_win);
    Iterator input2(src2, input2_win);
    Iterator dst(out, win);

    // 16 U8 elements per vector iteration (one full 128-bit register).
    const int  window_step_x  = 16 / sizeof(uint8_t);
    const auto window_start_x = static_cast<int>(window.x().start());
    const auto window_end_x   = static_cast<int>(window.x().end());

    execute_window_loop(win, [&](const Coordinates &)
    {
        const auto input1_ptr = reinterpret_cast<const uint8_t *>(input1.ptr());
        const auto input2_ptr = reinterpret_cast<const uint8_t *>(input2.ptr());
        const auto output_ptr = reinterpret_cast<int16_t *>(dst.ptr());

        // Compute window_step_x elements per iteration
        int x = window_start_x;
        for(; x <= (window_end_x - window_step_x); x += window_step_x)
        {
            const uint8x16_t bv = wrapper::vloadq(input2_ptr + x);
            const uint8x16_t av = wrapper::vloadq(input1_ptr + x);

            // Widen U8 -> U16 and multiply: the product of two U8 values
            // fits in 16 bits, so no further widening is needed.
            uint16x8_t tmp_low  = vmovl_u8(vget_low_u8(av));
            uint16x8_t tmp_high = vmovl_u8(vget_high_u8(av));
            tmp_low             = vmulq_u16(tmp_low, vmovl_u8(vget_low_u8(bv)));
            tmp_high            = vmulq_u16(tmp_high, vmovl_u8(vget_high_u8(bv)));

            if(is_scale255)
            {
                // Scale by 1/255 via the dedicated helper.
                tmp_low  = scale255_U16_U16(tmp_low);
                tmp_high = scale255_U16_U16(tmp_high);
            }
            else
            {
                // A negative left-shift count performs a right shift by n.
                const int16x8_t vn = vdupq_n_s16(-n);

                if(is_sat)
                {
                    tmp_low  = vqshlq_u16(tmp_low, vn);
                    tmp_high = vqshlq_u16(tmp_high, vn);
                }
                else
                {
                    tmp_low  = vshlq_u16(tmp_low, vn);
                    tmp_high = vshlq_u16(tmp_high, vn);
                }
            }

            if(is_sat)
            {
                // Clamp to SHRT_MAX so the u16 -> s16 reinterpret below cannot
                // produce a negative value.
                static const uint16x8_t max = vdupq_n_u16(SHRT_MAX);

                tmp_low  = vminq_u16(tmp_low, max);
                tmp_high = vminq_u16(tmp_high, max);
            }

            vst1q_s16(output_ptr + x, vreinterpretq_s16_u16(tmp_low));
            vst1q_s16(output_ptr + x + 8, vreinterpretq_s16_u16(tmp_high));
        }

        // Compute left-over elements (scalar equivalent of the vector loop)
        for(; x < window_end_x; ++x)
        {
            int32_t tmp = static_cast<int32_t>(*(input1_ptr + x)) * static_cast<int32_t>(*(input2_ptr + x));

            if(is_scale255)
            {
                // Round-to-nearest scaling by 1/255.
                float tmp_f = static_cast<float>(tmp) * scale255_constant;
                tmp         = static_cast<int32_t>(tmp_f + 0.5f);
            }
            else
            {
                tmp >>= n;
            }

            if(is_sat)
            {
                tmp = (tmp > SHRT_MAX) ? SHRT_MAX : tmp;
            }

            *(output_ptr + x) = static_cast<int16_t>(tmp);
        }
    },
    input1, input2, dst);
}
1377
/** S16 x U8 -> S16 element-wise multiplication worker.
 *
 * Widens the U8 operand to S16 and performs the scaled multiplication with
 * the shared S16xS16 lane helper. Scaling is either 1/255
 * (when @p is_scale255 is true) or 2^-n (when false); @p is_sat clamps the
 * result to the S16 range.
 *
 * @param src1   First input tensor (S16).
 * @param src2   Second input tensor (U8).
 * @param out    Output tensor (S16).
 * @param window Region on which to execute.
 * @param n      Number of bits to shift right when not scaling by 1/255.
 */
template <bool is_scale255, bool is_sat>
void mul_S16_U8_S16(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, int n)
{
    // Create input windows; broadcast inputs whose dimension is 1.
    Window win        = window;
    Window input1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
    Window input2_win = window.broadcast_if_dimension_le_one(src2->info()->tensor_shape());

    // Clear X Dimension on execution window as we handle manually
    win.set(Window::DimX, Window::Dimension(0, 1, 1));
    input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
    input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));

    Iterator input1(src1, input1_win);
    Iterator input2(src2, input2_win);
    Iterator dst(out, win);

    // 16 elements per vector iteration (two 8-lane S16 registers).
    const int  window_step_x  = 16;
    const auto window_start_x = static_cast<int>(window.x().start());
    const auto window_end_x   = static_cast<int>(window.x().end());

    execute_window_loop(win, [&](const Coordinates &)
    {
        const auto input1_ptr = reinterpret_cast<const int16_t *>(input1.ptr());
        const auto input2_ptr = reinterpret_cast<const uint8_t *>(input2.ptr());
        const auto output_ptr = reinterpret_cast<int16_t *>(dst.ptr());

        // Compute window_step_x elements per iteration
        int x = window_start_x;
        for(; x <= (window_end_x - window_step_x); x += window_step_x)
        {
            const int16x8x2_t ta1 =
            {
                {
                    vld1q_s16(input1_ptr + x),
                    vld1q_s16(input1_ptr + x + 8),
                }
            };
            const uint8x8x2_t ta2u =
            {
                {
                    vld1_u8(input2_ptr + x),
                    vld1_u8(input2_ptr + x + 8),
                }
            };
            // Widen U8 -> U16 then reinterpret as S16: values are <= 255, so
            // they are representable (non-negative) in int16.
            const int16x8x2_t ta2 =
            {
                {
                    vreinterpretq_s16_u16(vmovl_u8(ta2u.val[0])),
                    vreinterpretq_s16_u16(vmovl_u8(ta2u.val[1]))
                }
            };

            // Scaled (and optionally saturated) lane-wise multiply.
            const int16x8x2_t result = mul_S16_S16_S16_n_k<is_scale255, is_sat>(ta1, ta2, n);

            vst1q_s16(output_ptr + x, result.val[0]);
            vst1q_s16(output_ptr + x + 8, result.val[1]);
        }

        // Compute left-over elements (scalar equivalent of the vector loop)
        for(; x < window_end_x; ++x)
        {
            int32_t tmp = static_cast<int32_t>(*(input1_ptr + x)) * static_cast<int32_t>(*(input2_ptr + x));

            if(is_scale255)
            {
                // Round-to-nearest scaling by 1/255.
                float tmp_f = static_cast<float>(tmp) * scale255_constant;

                tmp = static_cast<int32_t>(tmp_f + 0.5f);
            }
            else
            {
                if(tmp >= 0)
                {
                    tmp >>= n;
                }
                else
                {
                    // For negative products, bias by (2^n - 1) before the
                    // arithmetic shift so the division rounds toward zero
                    // (a plain >> would round toward -infinity).
                    uint32_t mask = (1u << n) - 1;
                    tmp           = (tmp + static_cast<int32_t>(mask)) >> n;
                }
            }
            if(is_sat)
            {
                tmp = (tmp > SHRT_MAX) ? SHRT_MAX : ((tmp < SHRT_MIN) ? SHRT_MIN : tmp);
            }
            *(output_ptr + x) = static_cast<int16_t>(tmp);
        }
    },
    input1, input2, dst);
}
1469
/** U8 x S16 -> S16 element-wise multiplication worker.
 *
 * Multiplication is commutative, so this forwards to the S16 x U8
 * implementation with the two inputs exchanged.
 *
 * @param src1   First input tensor (U8).
 * @param src2   Second input tensor (S16).
 * @param out    Output tensor (S16).
 * @param window Region on which to execute.
 * @param n      Number of bits to shift right when not scaling by 1/255.
 */
template <bool is_scale255, bool is_sat>
void mul_U8_S16_S16(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, int n)
{
    // Simply swap the two input buffers
    mul_S16_U8_S16<is_scale255, is_sat>(src2, src1, out, window, n);
}
1476} // namespace
1477
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001478void CpuPixelWiseMultiplicationKernel::configure(ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy)
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001479{
Ioan-Cristian Szabo754e9522017-11-28 18:29:43 +00001480 ARM_COMPUTE_UNUSED(rounding_policy);
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001481 ARM_COMPUTE_ERROR_ON_NULLPTR(src1, src2, dst);
Georgios Pinitasf0dea702017-07-03 18:17:28 +01001482
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001483 ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src1, src2, dst, scale, overflow_policy, rounding_policy));
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001484
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001485 const TensorShape &out_shape = TensorShape::broadcast_shape(src1->tensor_shape(), src2->tensor_shape());
Michalis Spyrou861f0db2018-02-26 16:47:58 +00001486
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001487 // Auto initialize dst if not initialized
1488 set_shape_if_empty(*dst, out_shape);
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +01001489
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +01001490 _scale = scale;
1491 _scale_exponent = 0;
1492 _func_quantized = nullptr;
1493 _func_int = nullptr;
1494 _func_float = nullptr;
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001495
1496 bool is_scale_255 = false;
1497 // Check and validate scaling factor
1498 if(std::abs(scale - scale255_constant) < 0.00001f)
1499 {
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001500 is_scale_255 = true;
1501 }
1502 else
1503 {
Ioan-Cristian Szabo754e9522017-11-28 18:29:43 +00001504 int exponent = 0;
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001505
Ioan-Cristian Szabo754e9522017-11-28 18:29:43 +00001506 std::frexp(scale, &exponent);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001507
Ioan-Cristian Szabo754e9522017-11-28 18:29:43 +00001508 // Store the positive exponent. We know that we compute 1/2^n
1509 // Additionally we need to subtract 1 to compensate that frexp used a mantissa of 0.5
1510 _scale_exponent = std::abs(exponent - 1);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001511 }
1512
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001513 const DataType dt_input1 = src1->data_type();
1514 const DataType dt_input2 = src2->data_type();
1515 const DataType dt_output = dst->data_type();
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001516 const bool is_sat = (overflow_policy == ConvertPolicy::SATURATE);
1517
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +01001518 switch(dt_input1)
Manuel Bottini79fa9a22019-02-22 17:54:22 +00001519 {
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +01001520 case DataType::QASYMM8:
1521 if(dt_input2 == DataType::QASYMM8 && dt_output == DataType::QASYMM8)
1522 {
1523 _func_quantized = &mul_saturate_quantized_8<uint8_t>;
1524 }
1525 break;
1526 case DataType::QASYMM8_SIGNED:
1527 if(dt_input2 == DataType::QASYMM8_SIGNED)
1528 {
1529 _func_quantized = &mul_saturate_quantized_8<int8_t>;
1530 ;
1531 }
1532 break;
1533 case DataType::QSYMM16:
1534 if(dt_input2 == DataType::QSYMM16 && dt_output == DataType::QSYMM16)
1535 {
1536 _func_quantized = &mul_saturate_QSYMM16_QSYMM16_QSYMM16;
1537 }
1538 else if(dt_input2 == DataType::QSYMM16 && dt_output == DataType::S32)
1539 {
1540 _func_int = &mul_QSYMM16_QSYMM16_S32;
1541 }
1542 break;
1543 case DataType::S16:
1544 if(DataType::U8 == dt_input2 && DataType::S16 == dt_output)
1545 {
1546 if(is_scale_255)
1547 {
1548 _func_int = is_sat ? &mul_S16_U8_S16<true, true> : &mul_S16_U8_S16<true, false>;
1549 }
1550 else
1551 {
1552 _func_int = is_sat ? &mul_S16_U8_S16<false, true> : &mul_S16_U8_S16<false, false>;
1553 }
1554 }
1555 if(DataType::S16 == dt_input2 && DataType::S16 == dt_output)
1556 {
1557 if(is_scale_255)
1558 {
1559 _func_int = is_sat ? &mul_S16_S16_S16<true, true> : &mul_S16_S16_S16<true, false>;
1560 }
1561 else
1562 {
1563 _func_int = is_sat ? &mul_S16_S16_S16<false, true> : &mul_S16_S16_S16<false, false>;
1564 }
1565 }
1566 break;
SiCong Libb88f892020-08-28 11:18:47 +01001567 case DataType::S32:
1568 if(DataType::S32 == dt_input2 && DataType::S32 == dt_output)
1569 {
1570 _func_int = is_sat ? &mul_S32_S32_S32<true> : &mul_S32_S32_S32<false>;
1571 }
1572 break;
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +01001573 case DataType::U8:
1574 if(DataType::U8 == dt_input2 && DataType::U8 == dt_output)
1575 {
1576 if(is_scale_255)
1577 {
1578 _func_int = is_sat ? &mul_U8_U8_U8<true, true> : &mul_U8_U8_U8<true, false>;
1579 }
1580 else
1581 {
1582 _func_int = is_sat ? &mul_U8_U8_U8<false, true> : &mul_U8_U8_U8<false, false>;
1583 }
1584 }
1585 else if(DataType::U8 == dt_input2 && DataType::S16 == dt_output)
1586 {
1587 if(is_scale_255)
1588 {
1589 _func_int = is_sat ? &mul_U8_U8_S16<true, true> : &mul_U8_U8_S16<true, false>;
1590 }
1591 else
1592 {
1593 _func_int = is_sat ? &mul_U8_U8_S16<false, true> : &mul_U8_U8_S16<false, false>;
1594 }
1595 }
1596 else if(DataType::S16 == dt_input2 && DataType::S16 == dt_output)
1597 {
1598 if(is_scale_255)
1599 {
1600 _func_int = is_sat ? &mul_U8_S16_S16<true, true> : &mul_U8_S16_S16<true, false>;
1601 }
1602 else
1603 {
1604 _func_int = is_sat ? &mul_U8_S16_S16<false, true> : &mul_U8_S16_S16<false, false>;
1605 }
1606 }
1607 break;
1608#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
1609 case DataType::F16:
1610 _func_float = &mul_F16_F16_F16;
1611 break;
1612#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
1613 case DataType::F32:
1614 _func_float = &mul_F32_F32_F32;
1615 break;
1616 default:
1617 ARM_COMPUTE_ERROR("You called with the wrong img formats");
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001618 }
1619
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +01001620 // Configure kernel window
SiCongLic7b1e842021-02-22 14:28:33 +00001621 Window win = calculate_max_window(out_shape);
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +01001622
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001623 ICpuKernel::configure(win);
Ioan-Cristian Szabo754e9522017-11-28 18:29:43 +00001624}
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001625
/** Static validation entry point: checks whether the given tensor infos and
 * policies form a valid configuration, without modifying any state.
 *
 * @return An error Status if validate_arguments() rejects the combination,
 *         otherwise an empty (OK) Status.
 */
Status CpuPixelWiseMultiplicationKernel::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float scale, ConvertPolicy overflow_policy,
                                                  RoundingPolicy rounding_policy)
{
    ARM_COMPUTE_ERROR_ON_NULLPTR(src1, src2, dst);
    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src1, src2, dst, scale, overflow_policy, rounding_policy));

    return Status{};
}
1634
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001635void CpuPixelWiseMultiplicationKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001636{
Moritz Pflanzerc186b572017-09-07 09:48:04 +01001637 ARM_COMPUTE_UNUSED(info);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001638 ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001639 ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001640
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001641 auto src1 = tensors.get_const_tensor(TensorType::ACL_SRC_0);
1642 auto src2 = tensors.get_const_tensor(TensorType::ACL_SRC_1);
1643 auto dst = tensors.get_tensor(TensorType::ACL_DST);
Michalis Spyrou6eb73452020-07-02 17:39:25 +01001644
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +01001645 if(_func_quantized != nullptr)
Michalis Spyrou861f0db2018-02-26 16:47:58 +00001646 {
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001647 (*_func_quantized)(src1, src2, dst, window, _scale);
Manuel Bottini79fa9a22019-02-22 17:54:22 +00001648 }
1649 else if(_func_int != nullptr)
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001650 {
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001651 (*_func_int)(src1, src2, dst, window, _scale_exponent);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001652 }
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001653 else
1654 {
1655 ARM_COMPUTE_ERROR_ON(_func_float == nullptr);
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001656 (*_func_float)(src1, src2, dst, window, _scale);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001657 }
1658}
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001659const char *CpuPixelWiseMultiplicationKernel::name() const
1660{
1661 return "CpuPixelWiseMultiplicationKernel";
1662}
namespace
{
/** Validate arguments for the complex pixel-wise multiplication.
 *
 * Both inputs must be 2-channel F32 tensors (complex values) with
 * broadcast-compatible shapes; if dst is already configured it must also be
 * 2-channel F32 and match the broadcast shape.
 */
Status validate_arguments_complex(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst)
{
    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src1, 2, DataType::F32);
    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src2, 2, DataType::F32);

    const TensorShape &out_shape = TensorShape::broadcast_shape(src1->tensor_shape(), src2->tensor_shape());

    // An empty broadcast shape means the two input shapes are incompatible.
    ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible");

    // Validate in case of configured dst
    if(dst->total_size() > 0)
    {
        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 2, DataType::F32);
        ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, dst->tensor_shape(), 0), "Wrong shape for dst");
    }

    return Status{};
}
} // namespace
giuros01154bc1c2019-03-26 17:44:40 +00001684
/** Configure the complex multiplication kernel: validate the 2-channel F32
 * inputs, auto-initialize dst to the broadcast shape if needed and set up the
 * execution window.
 */
void CpuComplexPixelWiseMultiplicationKernel::configure(ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst)
{
    ARM_COMPUTE_ERROR_ON_NULLPTR(src1, src2, dst);
    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_complex(src1, src2, dst));

    const TensorShape &out_shape = TensorShape::broadcast_shape(src1->tensor_shape(), src2->tensor_shape());

    // Auto initialize dst if not initialized: inherit channel count and data
    // type from the first input.
    const TensorInfo out_info(out_shape, src1->num_channels(), src1->data_type());
    auto_init_if_empty(*dst, out_info);

    // Configure kernel window over the broadcast output shape
    Window win = calculate_max_window(out_shape);

    ICpuKernel::configure(win);
}
1701
/** Static validation entry point for the complex kernel; stateless.
 *
 * @return An error Status if validate_arguments_complex() rejects the
 *         combination, otherwise an empty (OK) Status.
 */
Status CpuComplexPixelWiseMultiplicationKernel::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst)
{
    ARM_COMPUTE_ERROR_ON_NULLPTR(src1, src2, dst);
    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_complex(src1, src2, dst));

    return Status{};
}
1709
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001710void CpuComplexPixelWiseMultiplicationKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
giuros01154bc1c2019-03-26 17:44:40 +00001711{
1712 ARM_COMPUTE_UNUSED(info);
1713 ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001714 ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window);
giuros01154bc1c2019-03-26 17:44:40 +00001715
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001716 auto src1 = tensors.get_const_tensor(TensorType::ACL_SRC_0);
1717 auto src2 = tensors.get_const_tensor(TensorType::ACL_SRC_1);
1718 auto dst = tensors.get_tensor(TensorType::ACL_DST);
Michalis Spyrou6eb73452020-07-02 17:39:25 +01001719
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001720 c_mul_F32_F32_F32_n(src1, src2, dst, window);
giuros01154bc1c2019-03-26 17:44:40 +00001721}
Sheri Zhang1e3ab422021-03-16 17:35:08 +00001722
1723const char *CpuComplexPixelWiseMultiplicationKernel::name() const
1724{
1725 return "CpuComplexPixelWiseMultiplicationKernel";
1726}
1727} // namespace kernels
1728} // namespace cpu
Manuel Bottini79fa9a22019-02-22 17:54:22 +00001729} // namespace arm_compute