/*
 * Copyright (c) 2016-2020 ARM Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#include "arm_compute/core/NEON/kernels/NEPixelWiseMultiplicationKernel.h"

#include "arm_compute/core/CPP/Validate.h"
#include "arm_compute/core/NEON/NEAsymm.h"
#include "arm_compute/core/NEON/NESymm.h"
#include "arm_compute/core/NEON/wrapper/wrapper.h"
#include "arm_compute/core/TensorInfo.h"

#include <arm_neon.h>

#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
#include <arm_fp16.h> // needed for float16_t
#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */

namespace arm_compute
{
namespace
{
const float       scale255_constant      = 1.f / 255.f;
const float32x4_t scale255_constant_f32q = vdupq_n_f32(scale255_constant);
const float32x4_t positive_round_f32q    = vdupq_n_f32(0.5f);

inline Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy)
{
    ARM_COMPUTE_UNUSED(overflow_policy);
    ARM_COMPUTE_UNUSED(rounding_policy);

    ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input1);
    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::S16, DataType::QSYMM16, DataType::F16, DataType::F32);
    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::S16, DataType::QSYMM16, DataType::F16, DataType::F32);
    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
                                                         DataType::S16, DataType::QSYMM16,
                                                         DataType::S32, DataType::F16, DataType::F32);
    if(is_data_type_quantized(input1->data_type()) || is_data_type_quantized(input2->data_type()))
    {
        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input1, input2);
        ARM_COMPUTE_RETURN_ERROR_ON_MSG(overflow_policy == ConvertPolicy::WRAP, "ConvertPolicy cannot be WRAP if datatype is quantized");
    }

    if(output->total_size() > 0)
    {
        const TensorShape &out_shape = TensorShape::broadcast_shape(input1->tensor_shape(), input2->tensor_shape());
        ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, output->tensor_shape(), 0), "Wrong shape for output");
        ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible");

        ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->data_type() == DataType::U8 && (input1->data_type() != DataType::U8 || input2->data_type() != DataType::U8),
                                        "Output can only be U8 if both inputs are U8");
        ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->data_type() == DataType::QASYMM8 && (input1->data_type() != DataType::QASYMM8 || input2->data_type() != DataType::QASYMM8),
                                        "Output can only be QASYMM8 if both inputs are QASYMM8");
        ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->data_type() == DataType::QASYMM8_SIGNED && (input1->data_type() != DataType::QASYMM8_SIGNED || input2->data_type() != DataType::QASYMM8_SIGNED),
                                        "Output can only be QASYMM8_SIGNED if both inputs are QASYMM8_SIGNED");
        ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->data_type() == DataType::QSYMM16 && (input1->data_type() != DataType::QSYMM16 || input2->data_type() != DataType::QSYMM16),
                                        "Output can only be QSYMM16 if both inputs are QSYMM16");
        ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->data_type() == DataType::S32 && (input1->data_type() != DataType::QSYMM16 || input2->data_type() != DataType::QSYMM16),
                                        "Output can only be S32 if both inputs are QSYMM16");
        ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->data_type() == DataType::S32 && scale != 1.f, "Unsupported scale for QSYMM16 inputs and S32 output");
    }

    if(std::abs(scale - scale255_constant) < 0.00001f)
    {
        ARM_COMPUTE_RETURN_ERROR_ON(rounding_policy != RoundingPolicy::TO_NEAREST_UP && rounding_policy != RoundingPolicy::TO_NEAREST_EVEN);
    }
    else
    {
        ARM_COMPUTE_RETURN_ERROR_ON(rounding_policy != RoundingPolicy::TO_ZERO);

        int         exponent            = 0;
        const float normalized_mantissa = std::frexp(scale, &exponent);

        // Use int scaling if factor is equal to 1/2^n for 0 <= n <= 15
        // frexp returns 0.5 as mantissa, which means that the exponent will be in the range -14 <= e <= 1
        // Moreover, it will be non-positive as we deal with 1/2^n for n >= 1
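        // e.g. (illustrative) for scale = 1/8, std::frexp gives mantissa 0.5 and exponent -2, since 1/8 = 0.5 * 2^-2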
        ARM_COMPUTE_RETURN_ERROR_ON_MSG(!((normalized_mantissa == 0.5f) && (-14 <= exponent) && (exponent <= 1)), "Scale value not supported (Should be 1/(2^n) or 1/255)");
    }

    return Status{};
}

/** Scales a given vector by 1/255.
 *
 * @note This does not work for all cases, e.g. for a float of 0.49999999999999994 or very large floats.
 *
 * @param in Input vector to scale.
 * @return Scaled output rounded to nearest (round half up).
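 *
 * Example (illustrative): a lane value of 383 maps to 383 * (1/255) = 1.502..., the +0.5 bias
 * gives 2.002..., and the truncating conversion returns 2 (while 382 yields 1.998... -> 1).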
 */
inline int32x4_t scale255_S32_S32(int32x4_t in)
{
    // Scale
    const float32x4_t tmp = vmulq_f32(vcvtq_f32_s32(in), scale255_constant_f32q);
    // Round to nearest (round half up)
    // Add +0.5 for all values
    // Afterwards vcvt rounds toward zero
    return vcvtq_s32_f32(vaddq_f32(tmp, positive_round_f32q));
}

inline uint16x8_t scale255_U16_U16(uint16x8_t in)
{
    const int32x4_t tmp_s1 = scale255_S32_S32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(in))));
    const int32x4_t tmp_s2 = scale255_S32_S32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(in))));
    return vreinterpretq_u16_s16(vcombine_s16(vmovn_s32(tmp_s2), vmovn_s32(tmp_s1)));
}

template <typename T>
inline typename std::enable_if<std::is_same<T, int8_t>::value, int8x16_t>::type
vquantize(float32x4x4_t val, const UniformQuantizationInfo &info)
{
    return vquantize_signed(val, info);
}

template <typename T>
inline typename std::enable_if<std::is_same<T, uint8_t>::value, uint8x16_t>::type
vquantize(float32x4x4_t val, const UniformQuantizationInfo &info)
{
    return vquantize(val, info);
}

template <typename T>
inline typename std::enable_if<std::is_same<T, int8_t>::value, int8_t>::type
quantize(float val, const UniformQuantizationInfo &info)
{
    int32_t tmp = static_cast<int32_t>(val / info.scale) + info.offset;

    T tmp_qua = static_cast<T>((tmp > SCHAR_MAX) ? SCHAR_MAX : ((tmp < SCHAR_MIN) ? SCHAR_MIN : tmp));
    return tmp_qua;
}

template <typename T>
inline typename std::enable_if<std::is_same<T, uint8_t>::value, uint8_t>::type
quantize(float val, const UniformQuantizationInfo &info)
{
    int32_t tmp = static_cast<int32_t>(val / info.scale) + info.offset;

    T tmp_qua = static_cast<T>((tmp > UCHAR_MAX) ? UCHAR_MAX : tmp);
    return tmp_qua;
}

template <typename T>
inline float dequantize(const T *input, const UniformQuantizationInfo &info)
{
    return static_cast<float>((*input) - info.offset) * info.scale;
}
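
// Worked example (illustrative values): with scale = 0.5 and offset = 10, quantize<uint8_t>(2.0f, info)
// computes 2.0 / 0.5 + 10 = 14, and dequantize(&q, info) with q = 14 recovers (14 - 10) * 0.5 = 2.0f.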

template <typename T>
void mul_saturate_quantized_8(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, float scale)
{
    const UniformQuantizationInfo input1_qua_info = in1->info()->quantization_info().uniform();
    const UniformQuantizationInfo input2_qua_info = in2->info()->quantization_info().uniform();
    const UniformQuantizationInfo output_qua_info = out->info()->quantization_info().uniform();

    // Create input windows
    Window win        = window;
    Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape());
    Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape());

    // Clear X Dimension on execution window as we handle manually
    win.set(Window::DimX, Window::Dimension(0, 1, 1));
    input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
    input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));

    Iterator input1(in1, input1_win);
    Iterator input2(in2, input2_win);
    Iterator output(out, win);

    const int  window_step_x  = 16 / sizeof(T);
    const auto window_start_x = static_cast<int>(window.x().start());
    const auto window_end_x   = static_cast<int>(window.x().end());

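    // The user-requested scale is folded into the output quantization step: with
    // tmp_qua_info.scale = output_scale / scale, quantizing the raw dequantized product val
    // yields val * scale / output_scale + offset, so no separate multiply by scale is needed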
    const UniformQuantizationInfo tmp_qua_info = { output_qua_info.scale / scale, output_qua_info.offset };

    execute_window_loop(win, [&](const Coordinates &)
    {
        const auto input1_ptr = reinterpret_cast<const T *>(input1.ptr());
        const auto input2_ptr = reinterpret_cast<const T *>(input2.ptr());
        const auto output_ptr = reinterpret_cast<T *>(output.ptr());

        // Compute window_step_x elements per iteration
        int x = window_start_x;
        for(; x <= (window_end_x - window_step_x); x += window_step_x)
        {
            const auto input1_q = wrapper::vloadq(input1_ptr + x);
            const auto input2_q = wrapper::vloadq(input2_ptr + x);

            // Dequantize inputs
            const float32x4x4_t in1_f32x4x4 = vdequantize(input1_q, input1_qua_info);
            const float32x4x4_t in2_f32x4x4 = vdequantize(input2_q, input2_qua_info);

            const float32x4x4_t out_f32x4x4 =
            {
                vmulq_f32(in1_f32x4x4.val[0], in2_f32x4x4.val[0]),
                vmulq_f32(in1_f32x4x4.val[1], in2_f32x4x4.val[1]),
                vmulq_f32(in1_f32x4x4.val[2], in2_f32x4x4.val[2]),
                vmulq_f32(in1_f32x4x4.val[3], in2_f32x4x4.val[3]),
            };

            // Quantize output
            const auto result = vquantize<T>(out_f32x4x4, tmp_qua_info);
            wrapper::vstore(output_ptr + x, result);
        }

        // Compute left-over elements
        for(; x < window_end_x; ++x)
        {
            // Dequantize inputs
            float tmp_in1 = dequantize(input1_ptr + x, input1_qua_info);
            float tmp_in2 = dequantize(input2_ptr + x, input2_qua_info);
            float tmp_f   = tmp_in1 * tmp_in2;

            // Quantize output
            const auto tmp_qua = quantize<T>(tmp_f, tmp_qua_info);
            *(output_ptr + x)  = tmp_qua;
        }
    },
    input1, input2, output);
}

void mul_saturate_QSYMM16_QSYMM16_QSYMM16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, float scale)
{
    const UniformQuantizationInfo input1_qua_info = in1->info()->quantization_info().uniform();
    const UniformQuantizationInfo input2_qua_info = in2->info()->quantization_info().uniform();
    const UniformQuantizationInfo output_qua_info = out->info()->quantization_info().uniform();

    // Create input windows
    Window win        = window;
    Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape());
    Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape());

    // Clear X Dimension on execution window as we handle manually
    win.set(Window::DimX, Window::Dimension(0, 1, 1));
    input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
    input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));

    Iterator input1(in1, input1_win);
    Iterator input2(in2, input2_win);
    Iterator output(out, win);

    const int  window_step_x  = 16;
    const auto window_start_x = static_cast<int>(window.x().start());
    const auto window_end_x   = static_cast<int>(window.x().end());

    const UniformQuantizationInfo tmp_qua_info = { output_qua_info.scale / scale, output_qua_info.offset };

    execute_window_loop(win, [&](const Coordinates &)
    {
        const auto input1_ptr = reinterpret_cast<const qsymm16_t *>(input1.ptr());
        const auto input2_ptr = reinterpret_cast<const qsymm16_t *>(input2.ptr());
        const auto output_ptr = reinterpret_cast<qsymm16_t *>(output.ptr());

        // Compute window_step_x elements per iteration
        int x = window_start_x;
        for(; x <= (window_end_x - window_step_x); x += window_step_x)
        {
            const qsymm16x8x2_t input1_q =
            {
                {
                    vld1q_s16(input1_ptr + x),
                    vld1q_s16(input1_ptr + x + 8),
                }
            };
            const qsymm16x8x2_t input2_q =
            {
                {
                    vld1q_s16(input2_ptr + x),
                    vld1q_s16(input2_ptr + x + 8),
                }
            };

            // Dequantize inputs
            const float32x4x4_t in1_f32x4x4 = vdequantize(input1_q, input1_qua_info);
            const float32x4x4_t in2_f32x4x4 = vdequantize(input2_q, input2_qua_info);

            const float32x4x4_t out_f32x4x4 =
            {
                vmulq_f32(in1_f32x4x4.val[0], in2_f32x4x4.val[0]),
                vmulq_f32(in1_f32x4x4.val[1], in2_f32x4x4.val[1]),
                vmulq_f32(in1_f32x4x4.val[2], in2_f32x4x4.val[2]),
                vmulq_f32(in1_f32x4x4.val[3], in2_f32x4x4.val[3]),
            };

            const qsymm16x8x2_t result = vquantize_qsymm16(out_f32x4x4, tmp_qua_info);
            vst1q_s16(output_ptr + x, result.val[0]);
            vst1q_s16(output_ptr + x + 8, result.val[1]);
        }

        // Compute left-over elements
        for(; x < window_end_x; ++x)
        {
            // Dequantize inputs
            float tmp_in1 = static_cast<float>(*(input1_ptr + x)) * input1_qua_info.scale;
            float tmp_in2 = static_cast<float>(*(input2_ptr + x)) * input2_qua_info.scale;
            float tmp_f   = tmp_in1 * tmp_in2;

            // Quantize output: lrintf() rounds to nearest, matching the vector quantization path above
            int32_t   tmp     = lrintf(tmp_f / tmp_qua_info.scale);
            qsymm16_t tmp_qua = static_cast<qsymm16_t>((tmp > SHRT_MAX) ? SHRT_MAX : ((tmp < SHRT_MIN) ? SHRT_MIN : tmp));
            *(output_ptr + x) = tmp_qua;
        }
    },
    input1, input2, output);
}

void mul_QSYMM16_QSYMM16_S32(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, int scale)
{
    ARM_COMPUTE_UNUSED(scale);
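    // Note: validate_arguments() only accepts scale == 1.f for the QSYMM16 x QSYMM16 -> S32 path,
    // so the shift exponent passed in here is always 0 and the parameter is intentionally unused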

    // Create input windows
    Window win        = window;
    Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape());
    Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape());

    // Clear X Dimension on execution window as we handle manually
    win.set(Window::DimX, Window::Dimension(0, 1, 1));
    input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
    input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));

    Iterator input1(in1, input1_win);
    Iterator input2(in2, input2_win);
    Iterator output(out, win);

    const int  window_step_x  = 16;
    const auto window_start_x = static_cast<int>(window.x().start());
    const auto window_end_x   = static_cast<int>(window.x().end());

    execute_window_loop(win, [&](const Coordinates &)
    {
        const auto input1_ptr = reinterpret_cast<const qsymm16_t *>(input1.ptr());
        const auto input2_ptr = reinterpret_cast<const qsymm16_t *>(input2.ptr());
        const auto output_ptr = reinterpret_cast<int32_t *>(output.ptr());

        // Compute window_step_x elements per iteration
        int x = window_start_x;
        for(; x <= (window_end_x - window_step_x); x += window_step_x)
        {
            const qsymm16x8x2_t input1_q =
            {
                {
                    vld1q_s16(input1_ptr + x),
                    vld1q_s16(input1_ptr + x + 8),
                }
            };
            const qsymm16x8x2_t input2_q =
            {
                {
                    vld1q_s16(input2_ptr + x),
                    vld1q_s16(input2_ptr + x + 8),
                }
            };

            const int32x4x4_t in1_s32 =
            {
                {
                    vmovl_s16(vget_low_s16(input1_q.val[0])),
                    vmovl_s16(vget_high_s16(input1_q.val[0])),
                    vmovl_s16(vget_low_s16(input1_q.val[1])),
                    vmovl_s16(vget_high_s16(input1_q.val[1])),
                }
            };
            const int32x4x4_t in2_s32 =
            {
                {
                    vmovl_s16(vget_low_s16(input2_q.val[0])),
                    vmovl_s16(vget_high_s16(input2_q.val[0])),
                    vmovl_s16(vget_low_s16(input2_q.val[1])),
                    vmovl_s16(vget_high_s16(input2_q.val[1])),
                }
            };

            const int32x4x4_t result =
            {
                {
                    vmulq_s32(in1_s32.val[0], in2_s32.val[0]),
                    vmulq_s32(in1_s32.val[1], in2_s32.val[1]),
                    vmulq_s32(in1_s32.val[2], in2_s32.val[2]),
                    vmulq_s32(in1_s32.val[3], in2_s32.val[3]),
                }
            };

            vst1q_s32(output_ptr + x, result.val[0]);
            vst1q_s32(output_ptr + x + 4, result.val[1]);
            vst1q_s32(output_ptr + x + 8, result.val[2]);
            vst1q_s32(output_ptr + x + 12, result.val[3]);
        }

        // Compute left-over elements
        for(; x < window_end_x; ++x)
        {
            int32_t tmp       = static_cast<int32_t>(*(input1_ptr + x)) * static_cast<int32_t>(*(input2_ptr + x));
            *(output_ptr + x) = tmp;
        }
    },
    input1, input2, output);
}

template <bool is_scale255, bool is_sat>
void mul_U8_U8_U8(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, int n)
{
    // Create input windows
    Window win        = window;
    Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape());
    Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape());

    // Clear X Dimension on execution window as we handle manually
    win.set(Window::DimX, Window::Dimension(0, 1, 1));
    input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
    input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));

    Iterator input1(in1, input1_win);
    Iterator input2(in2, input2_win);
    Iterator output(out, win);

    const int  window_step_x  = 16 / sizeof(uint8_t);
    const auto window_start_x = static_cast<int>(window.x().start());
    const auto window_end_x   = static_cast<int>(window.x().end());

    execute_window_loop(win, [&](const Coordinates &)
    {
        const auto input1_ptr = reinterpret_cast<const uint8_t *>(input1.ptr());
        const auto input2_ptr = reinterpret_cast<const uint8_t *>(input2.ptr());
        const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr());

        // Compute window_step_x elements per iteration
        int x = window_start_x;
        for(; x <= (window_end_x - window_step_x); x += window_step_x)
        {
            const uint8x16_t ta1 = wrapper::vloadq(input1_ptr + x);
            const uint8x16_t ta2 = wrapper::vloadq(input2_ptr + x);

            uint16x8_t       tmp1_high = vmovl_u8(vget_high_u8(ta1));
            const uint16x8_t tmp2_high = vmovl_u8(vget_high_u8(ta2));
            uint16x8_t       tmp1_low  = vmovl_u8(vget_low_u8(ta1));
            const uint16x8_t tmp2_low  = vmovl_u8(vget_low_u8(ta2));

            tmp1_high = vmulq_u16(tmp1_high, tmp2_high);
            tmp1_low  = vmulq_u16(tmp1_low, tmp2_low);

            if(is_scale255)
            {
                tmp1_high = scale255_U16_U16(tmp1_high);
                tmp1_low  = scale255_U16_U16(tmp1_low);
            }
            else
            {
                const int16x8_t vn = vdupq_n_s16(-n);
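                // A negative per-lane shift count makes vshlq/vqshlq shift right by n,
                // i.e. divide by 2^n (the saturating variant is selected when is_sat is set)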

                if(is_sat)
                {
                    tmp1_high = vqshlq_u16(tmp1_high, vn);
                    tmp1_low  = vqshlq_u16(tmp1_low, vn);
                }
                else
                {
                    tmp1_high = vshlq_u16(tmp1_high, vn);
                    tmp1_low  = vshlq_u16(tmp1_low, vn);
                }
            }
            if(is_sat)
            {
                vst1q_u8(output_ptr + x, vcombine_u8(vqmovn_u16(tmp1_low), vqmovn_u16(tmp1_high)));
            }
            else
            {
                vst1q_u8(output_ptr + x, vcombine_u8(vmovn_u16(tmp1_low), vmovn_u16(tmp1_high)));
            }
        }

        // Compute left-over elements
        for(; x < window_end_x; ++x)
        {
            uint16_t tmp = static_cast<uint16_t>(*(input1_ptr + x)) * static_cast<uint16_t>(*(input2_ptr + x));

            if(is_scale255)
            {
                float tmp_f = static_cast<float>(tmp) * scale255_constant;
                tmp         = static_cast<uint16_t>(tmp_f + 0.5f);
            }
            else
            {
                tmp >>= n;
            }
            if(is_sat && tmp > 255)
            {
                tmp = 255;
            }
            *(output_ptr + x) = static_cast<uint8_t>(tmp);
        }
    },
    input1, input2, output);
}

template <bool is_scale255, bool is_sat>
inline int16x8_t mul_S16_S16_S16_n_loop(const int16x8_t &input1, const int16x8_t &input2, int n)
{
    int32x4_t       tmp1_high = vmovl_s16(vget_high_s16(input1));
    const int32x4_t tmp2_high = vmovl_s16(vget_high_s16(input2));
    int32x4_t       tmp1_low  = vmovl_s16(vget_low_s16(input1));
    const int32x4_t tmp2_low  = vmovl_s16(vget_low_s16(input2));

    tmp1_high = vmulq_s32(tmp1_high, tmp2_high);
    tmp1_low  = vmulq_s32(tmp1_low, tmp2_low);

    if(is_scale255)
    {
        tmp1_high = scale255_S32_S32(tmp1_high);
        tmp1_low  = scale255_S32_S32(tmp1_low);
    }
    else
    {
        // Right shift amount
        const int32x4_t vn = vdupq_n_s32(-n);
        // Left shift amount
        const int32x4_t vnl = vdupq_n_s32(n);
        // Calculate conversion bit
        const uint32x4_t tmp1_high_u  = vreinterpretq_u32_s32(tmp1_high);
        const uint32x4_t tmp1_low_u   = vreinterpretq_u32_s32(tmp1_low);
        const uint32x4_t sign_high    = vshrq_n_u32(tmp1_high_u, 31);
        const uint32x4_t sign_low     = vshrq_n_u32(tmp1_low_u, 31);
        const int32x4_t  sign_high_s  = vreinterpretq_s32_u32(sign_high);
        const int32x4_t  sign_low_s   = vreinterpretq_s32_u32(sign_low);
        const int32x4_t  convert_high = vsubq_s32(vshlq_s32(sign_high_s, vnl), sign_high_s);
        const int32x4_t  convert_low  = vsubq_s32(vshlq_s32(sign_low_s, vnl), sign_low_s);
        if(is_sat)
        {
            tmp1_high = vqshlq_s32(vaddq_s32(tmp1_high, convert_high), vn);
            tmp1_low  = vqshlq_s32(vaddq_s32(tmp1_low, convert_low), vn);
        }
        else
        {
            tmp1_high = vshlq_s32(vaddq_s32(tmp1_high, convert_high), vn);
            tmp1_low  = vshlq_s32(vaddq_s32(tmp1_low, convert_low), vn);
        }
    }

    if(is_sat)
    {
        return vcombine_s16(vqmovn_s32(tmp1_low), vqmovn_s32(tmp1_high));
    }
    else
    {
        return vcombine_s16(vmovn_s32(tmp1_low), vmovn_s32(tmp1_high));
    }
}

template <bool is_scale255, bool is_sat>
inline int16x8x2_t mul_S16_S16_S16_n_k(const int16x8x2_t &input1, const int16x8x2_t &input2, int n)
{
    const int16x8x2_t result =
    {
        {
            // First 8 elements
            mul_S16_S16_S16_n_loop<is_scale255, is_sat>(input1.val[0], input2.val[0], n),
            // Second 8 elements
            mul_S16_S16_S16_n_loop<is_scale255, is_sat>(input1.val[1], input2.val[1], n)
        }
    };

    return result;
}

template <bool is_scale255, bool is_sat>
void mul_S16_S16_S16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, int n)
{
    // Create input windows
    Window win        = window;
    Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape());
    Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape());

    // Clear X Dimension on execution window as we handle manually
    win.set(Window::DimX, Window::Dimension(0, 1, 1));
    input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
    input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));

    Iterator input1(in1, input1_win);
    Iterator input2(in2, input2_win);
    Iterator output(out, win);

    const int  window_step_x  = 16;
    const auto window_start_x = static_cast<int>(window.x().start());
    const auto window_end_x   = static_cast<int>(window.x().end());

    execute_window_loop(win, [&](const Coordinates &)
    {
        const auto input1_ptr = reinterpret_cast<const int16_t *>(input1.ptr());
        const auto input2_ptr = reinterpret_cast<const int16_t *>(input2.ptr());
        const auto output_ptr = reinterpret_cast<int16_t *>(output.ptr());

        // Compute window_step_x elements per iteration
        int x = window_start_x;
        for(; x <= (window_end_x - window_step_x); x += window_step_x)
        {
            const int16x8x2_t ta1 =
            {
                {
                    vld1q_s16(input1_ptr + x),
                    vld1q_s16(input1_ptr + x + 8),
                }
            };
            const int16x8x2_t ta2 =
            {
                {
                    vld1q_s16(input2_ptr + x),
                    vld1q_s16(input2_ptr + x + 8),
                }
            };
            const int16x8x2_t result = mul_S16_S16_S16_n_k<is_scale255, is_sat>(ta1, ta2, n);

            vst1q_s16(output_ptr + x, result.val[0]);
            vst1q_s16(output_ptr + x + 8, result.val[1]);
        }

        // Compute left-over elements
        for(; x < window_end_x; ++x)
        {
            int32_t tmp = static_cast<int32_t>(*(input1_ptr + x)) * static_cast<int32_t>(*(input2_ptr + x));

            if(is_scale255)
            {
                float tmp_f = static_cast<float>(tmp) * scale255_constant;

                tmp = static_cast<int32_t>(tmp_f + 0.5f);
            }
            else
            {
                if(tmp >= 0)
                {
                    tmp >>= n;
                }
                else
                {
                    uint32_t mask = (1u << n) - 1;
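                    // Adding (2^n - 1) before the arithmetic shift rounds negative values toward
                    // zero, e.g. for tmp = -5, n = 1: (-5 + 1) >> 1 = -2, matching truncating division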
                    tmp = (tmp + static_cast<int32_t>(mask)) >> n;
                }
            }
            if(is_sat)
            {
                tmp = (tmp > SHRT_MAX) ? SHRT_MAX : ((tmp < SHRT_MIN) ? SHRT_MIN : tmp);
            }
            *(output_ptr + x) = static_cast<int16_t>(tmp);
        }
    },
    input1, input2, output);
}

void mul_F32_F32_F32(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, float scale)
{
    // Create input windows
    Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape());
    Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape());

    // Clear X Dimension on execution window as we handle manually
    Window win = window;
    win.set(Window::DimX, Window::Dimension(0, 1, 1));

    constexpr int window_step_x         = 16 / sizeof(float);
    const auto    window_start_x        = static_cast<int>(window.x().start());
    const auto    window_end_x          = static_cast<int>(window.x().end());
    const bool    is_broadcast_across_x = (input1_win.x().step() == 0) || (input2_win.x().step() == 0);

    using ExactTagType = typename wrapper::traits::neon_vector<float, window_step_x>::tag_type;

686 if(is_broadcast_across_x)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100687 {
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +0100688 const bool is_broadcast_input_2 = input2_win.x().step() == 0;
689 Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win;
690 Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win;
691 const ITensor *broadcast_tensor = is_broadcast_input_2 ? in2 : in1;
692 const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? in2 : in1;
693
694 // Clear X Dimension on execution window as we handle manually
695 non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));
696
697 Iterator broadcast_input(broadcast_tensor, broadcast_win);
698 Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
699 Iterator output(out, win);
700
701 execute_window_loop(win, [&](const Coordinates &)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100702 {
Sheri Zhangfcf6f4e2020-06-25 20:01:00 +0100703 const auto non_broadcast_input_ptr = reinterpret_cast<const float *>(non_broadcast_input.ptr());
704 const auto output_ptr = reinterpret_cast<float *>(output.ptr());
705
706 const float broadcast_value = *reinterpret_cast<const float *>(broadcast_input.ptr());
707 const auto broadcast_value_vec = wrapper::vdup_n(broadcast_value, ExactTagType{});
708 const auto scale_vec = wrapper::vdup_n(scale, ExactTagType{});
709
710 // Compute window_step_x elements per iteration
711 int x = window_start_x;
712 for(; x <= (window_end_x - window_step_x); x += window_step_x)
713 {
714 const auto non_broadcast_v = wrapper::vloadq(non_broadcast_input_ptr + x);
715 auto res = wrapper::vmul(wrapper::vmul(broadcast_value_vec, non_broadcast_v), scale_vec);
716 wrapper::vstore(output_ptr + x, res);
717 }
718
719 // Compute left-over elements
720 for(; x < window_end_x; ++x)
721 {
722 const auto non_broadcast_v = *(non_broadcast_input_ptr + x);
723 *(output_ptr + x) = broadcast_value * non_broadcast_v * scale;
724 }
725 },
726 broadcast_input, non_broadcast_input, output);
727 }
    else
    {
        // Clear X Dimension on execution window as we handle manually
        input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
        input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));

        Iterator input1(in1, input1_win);
        Iterator input2(in2, input2_win);
        Iterator output(out, win);

        execute_window_loop(win, [&](const Coordinates &)
        {
            const auto input1_ptr = reinterpret_cast<const float *>(input1.ptr());
            const auto input2_ptr = reinterpret_cast<const float *>(input2.ptr());
            const auto output_ptr = reinterpret_cast<float *>(output.ptr());

            // Compute window_step_x elements per iteration
            int x = window_start_x;
            for(; x <= (window_end_x - window_step_x); x += window_step_x)
            {
                const auto ta1       = wrapper::vloadq(input1_ptr + x);
                const auto ta2       = wrapper::vloadq(input2_ptr + x);
                const auto scale_vec = wrapper::vdup_n(scale, ExactTagType{});
                const auto res       = wrapper::vmul(wrapper::vmul(ta1, ta2), scale_vec);
                wrapper::vstore(output_ptr + x, res);
            }

            // Compute left-over elements
            for(; x < window_end_x; ++x)
            {
                const auto ta1    = *(input1_ptr + x);
                const auto ta2    = *(input2_ptr + x);
                *(output_ptr + x) = ta1 * ta2 * scale;
            }
        },
        input1, input2, output);
    }
}

void c_mul_F32_F32_F32_n(const void *__restrict input1_ptr, const void *__restrict input2_ptr, void *__restrict output_ptr)
{
    const auto input1 = static_cast<const float *__restrict>(input1_ptr);
    const auto input2 = static_cast<const float *__restrict>(input2_ptr);
    const auto output = static_cast<float *__restrict>(output_ptr);

    const float32x4_t a = wrapper::vloadq(input1);
    float32x4_t       b = wrapper::vloadq(input2);

    using ExactTagType = typename wrapper::traits::neon_vector<float, 2>::tag_type;

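    // Each float32x4_t holds two complex values interleaved as (re0, im0, re1, im1); per pair the
    // product is (a.re * b.re - a.im * b.im, a.re * b.im + a.im * b.re). tmp0/tmp1 broadcast the real
    // and imaginary lanes of a, and vrev64 plus the sign mask rearrange b so one vmla finishes the sum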
    const float32x4_t mask  = { -1.0f, 1.0f, -1.0f, 1.0f };
    const float32x2_t tmp00 = wrapper::vdup_n(wrapper::vgetlane(a, 0), ExactTagType{});
    const float32x2_t tmp01 = wrapper::vdup_n(wrapper::vgetlane(a, 1), ExactTagType{});
    const float32x2_t tmp10 = wrapper::vdup_n(wrapper::vgetlane(a, 2), ExactTagType{});
    const float32x2_t tmp11 = wrapper::vdup_n(wrapper::vgetlane(a, 3), ExactTagType{});

    const float32x4_t tmp0 = wrapper::vcombine(tmp00, tmp10);
    const float32x4_t tmp1 = wrapper::vcombine(tmp01, tmp11);

    float32x4_t res = wrapper::vmul(tmp0, b);

    b = wrapper::vrev64(b);
    b = wrapper::vmul(b, mask);

    res = wrapper::vmla(res, tmp1, b);
    wrapper::vstore(output, res);
}

#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
void mul_F16_F16_F16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, float scale)
{
    // Create input windows
    Window win        = window;
    Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape());
    Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape());

    // Clear X Dimension on execution window as we handle manually
    win.set(Window::DimX, Window::Dimension(0, 1, 1));
    input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
    input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));

    Iterator input1(in1, input1_win);
    Iterator input2(in2, input2_win);
    Iterator output(out, win);

    const int  window_step_x  = 16;
    const auto window_start_x = static_cast<int>(window.x().start());
    const auto window_end_x   = static_cast<int>(window.x().end());

    execute_window_loop(win, [&](const Coordinates &)
    {
        const auto input1_ptr = reinterpret_cast<const float16_t *>(input1.ptr());
        const auto input2_ptr = reinterpret_cast<const float16_t *>(input2.ptr());
        const auto output_ptr = reinterpret_cast<float16_t *>(output.ptr());

        // Compute window_step_x elements per iteration
        int x = window_start_x;
        for(; x <= (window_end_x - window_step_x); x += window_step_x)
        {
            const float16x8x2_t ta1 =
            {
                {
                    vld1q_f16(input1_ptr + x),
                    vld1q_f16(input1_ptr + x + 8),
                }
            };
            const float16x8x2_t ta2 =
            {
                {
                    vld1q_f16(input2_ptr + x),
                    vld1q_f16(input2_ptr + x + 8),
                }
            };
            const float16x8_t   scale_vec = vdupq_n_f16(scale);
            const float16x8x2_t result    =
            {
                {
                    vmulq_f16(vmulq_f16(ta1.val[0], ta2.val[0]), scale_vec),
                    vmulq_f16(vmulq_f16(ta1.val[1], ta2.val[1]), scale_vec),
                }
            };
            vst1q_f16(output_ptr + x, result.val[0]);
            vst1q_f16(output_ptr + x + 8, result.val[1]);
        }

        // Compute left-over elements
        for(; x < window_end_x; ++x)
        {
            const auto ta1    = *(input1_ptr + x);
            const auto ta2    = *(input2_ptr + x);
            *(output_ptr + x) = ta1 * ta2 * scale;
        }
    },
    input1, input2, output);
}
#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */

template <bool is_scale255, bool is_sat>
void mul_U8_U8_S16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, int n)
{
    // Create input windows
    Window win        = window;
    Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape());
    Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape());

    // Clear X Dimension on execution window as we handle manually
    win.set(Window::DimX, Window::Dimension(0, 1, 1));
    input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
    input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));

    Iterator input1(in1, input1_win);
    Iterator input2(in2, input2_win);
    Iterator output(out, win);

    const int  window_step_x  = 16 / sizeof(uint8_t);
    const auto window_start_x = static_cast<int>(window.x().start());
    const auto window_end_x   = static_cast<int>(window.x().end());

    execute_window_loop(win, [&](const Coordinates &)
    {
        const auto input1_ptr = reinterpret_cast<const uint8_t *>(input1.ptr());
        const auto input2_ptr = reinterpret_cast<const uint8_t *>(input2.ptr());
        const auto output_ptr = reinterpret_cast<int16_t *>(output.ptr());

        // Compute window_step_x elements per iteration
        int x = window_start_x;
        for(; x <= (window_end_x - window_step_x); x += window_step_x)
        {
            const uint8x16_t bv = wrapper::vloadq(input2_ptr + x);
            const uint8x16_t av = wrapper::vloadq(input1_ptr + x);

            uint16x8_t tmp_low  = vmovl_u8(vget_low_u8(av));
            uint16x8_t tmp_high = vmovl_u8(vget_high_u8(av));
            tmp_low             = vmulq_u16(tmp_low, vmovl_u8(vget_low_u8(bv)));
            tmp_high            = vmulq_u16(tmp_high, vmovl_u8(vget_high_u8(bv)));

            if(is_scale255)
            {
                tmp_low  = scale255_U16_U16(tmp_low);
                tmp_high = scale255_U16_U16(tmp_high);
            }
            else
            {
                const int16x8_t vn = vdupq_n_s16(-n);

                if(is_sat)
                {
                    tmp_low  = vqshlq_u16(tmp_low, vn);
                    tmp_high = vqshlq_u16(tmp_high, vn);
                }
                else
                {
                    tmp_low  = vshlq_u16(tmp_low, vn);
                    tmp_high = vshlq_u16(tmp_high, vn);
                }
            }

            if(is_sat)
            {
                static const uint16x8_t max = vdupq_n_u16(SHRT_MAX);

                tmp_low  = vminq_u16(tmp_low, max);
                tmp_high = vminq_u16(tmp_high, max);
            }

            vst1q_s16(output_ptr + x, vreinterpretq_s16_u16(tmp_low));
            vst1q_s16(output_ptr + x + 8, vreinterpretq_s16_u16(tmp_high));
        }

        // Compute left-over elements
        for(; x < window_end_x; ++x)
        {
            int32_t tmp = static_cast<int32_t>(*(input1_ptr + x)) * static_cast<int32_t>(*(input2_ptr + x));

            if(is_scale255)
            {
                float tmp_f = static_cast<float>(tmp) * scale255_constant;
                tmp         = static_cast<int32_t>(tmp_f + 0.5f);
            }
            else
            {
                tmp >>= n;
            }

            if(is_sat)
            {
                tmp = (tmp > SHRT_MAX) ? SHRT_MAX : tmp;
            }

            *(output_ptr + x) = static_cast<int16_t>(tmp);
        }
    },
    input1, input2, output);
}

template <bool is_scale255, bool is_sat>
void mul_S16_U8_S16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, int n)
{
    // Create input windows
    Window win        = window;
    Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape());
    Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape());

    // Clear X Dimension on execution window as we handle manually
    win.set(Window::DimX, Window::Dimension(0, 1, 1));
    input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
    input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));

    Iterator input1(in1, input1_win);
    Iterator input2(in2, input2_win);
    Iterator output(out, win);

    const int  window_step_x  = 16;
    const auto window_start_x = static_cast<int>(window.x().start());
    const auto window_end_x   = static_cast<int>(window.x().end());

    execute_window_loop(win, [&](const Coordinates &)
    {
        const auto input1_ptr = reinterpret_cast<const int16_t *>(input1.ptr());
        const auto input2_ptr = reinterpret_cast<const uint8_t *>(input2.ptr());
        const auto output_ptr = reinterpret_cast<int16_t *>(output.ptr());

        // Compute window_step_x elements per iteration
        int x = window_start_x;
        for(; x <= (window_end_x - window_step_x); x += window_step_x)
        {
            const int16x8x2_t ta1 =
            {
                {
                    vld1q_s16(input1_ptr + x),
                    vld1q_s16(input1_ptr + x + 8),
                }
            };
            const uint8x8x2_t ta2u =
            {
                {
                    vld1_u8(input2_ptr + x),
                    vld1_u8(input2_ptr + x + 8),
                }
            };
            const int16x8x2_t ta2 =
            {
                {
                    vreinterpretq_s16_u16(vmovl_u8(ta2u.val[0])),
                    vreinterpretq_s16_u16(vmovl_u8(ta2u.val[1]))
                }
            };

            const int16x8x2_t result = mul_S16_S16_S16_n_k<is_scale255, is_sat>(ta1, ta2, n);

            vst1q_s16(output_ptr + x, result.val[0]);
            vst1q_s16(output_ptr + x + 8, result.val[1]);
        }

        // Compute left-over elements
        for(; x < window_end_x; ++x)
        {
            int32_t tmp = static_cast<int32_t>(*(input1_ptr + x)) * static_cast<int32_t>(*(input2_ptr + x));

            if(is_scale255)
            {
                float tmp_f = static_cast<float>(tmp) * scale255_constant;

                tmp = static_cast<int32_t>(tmp_f + 0.5f);
            }
            else
            {
                if(tmp >= 0)
                {
                    tmp >>= n;
                }
                else
                {
                    uint32_t mask = (1u << n) - 1;
                    tmp           = (tmp + static_cast<int32_t>(mask)) >> n;
                }
            }
            if(is_sat)
            {
                tmp = (tmp > SHRT_MAX) ? SHRT_MAX : ((tmp < SHRT_MIN) ? SHRT_MIN : tmp);
            }
            *(output_ptr + x) = static_cast<int16_t>(tmp);
        }
    },
    input1, input2, output);
}

template <bool is_scale255, bool is_sat>
void mul_U8_S16_S16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, int n)
{
    // Simply swap the two input buffers
    mul_S16_U8_S16<is_scale255, is_sat>(in2, in1, out, window, n);
}
} // namespace

NEPixelWiseMultiplicationKernel::NEPixelWiseMultiplicationKernel()
    : _func_float(nullptr), _func_int(nullptr), _func_quantized(nullptr), _scale{ 0 }, _scale_exponent{ 0 }
{
}

void NEPixelWiseMultiplicationKernel::configure(ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy)
{
    ARM_COMPUTE_UNUSED(rounding_policy);
    ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);

    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input1, input2, output, scale, overflow_policy, rounding_policy));

    const std::pair<TensorShape, ValidRegion> broadcast_pair = ITensorInfo::broadcast_shape_and_valid_region(*input1, *input2);
    const TensorShape &out_shape    = broadcast_pair.first;
    const ValidRegion &valid_region = broadcast_pair.second;

    // Auto initialize output if not initialized
    set_shape_if_empty(*output, out_shape);

    _scale          = scale;
    _scale_exponent = 0;
    _func_quantized = nullptr;
    _func_int       = nullptr;
    _func_float     = nullptr;

    bool is_scale_255 = false;
    // Check and validate scaling factor
    if(std::abs(scale - scale255_constant) < 0.00001f)
    {
        is_scale_255 = true;
    }
    else
    {
        int exponent = 0;

        std::frexp(scale, &exponent);

        // Store the positive exponent. We know that we compute 1/2^n
        // Additionally we need to subtract 1 to compensate that frexp used a mantissa of 0.5
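        // e.g. scale = 1/4: frexp gives exponent -1 (since 0.25 = 0.5 * 2^-1), so _scale_exponent = |-1 - 1| = 2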
        _scale_exponent = std::abs(exponent - 1);
    }

    const DataType dt_input1 = input1->data_type();
    const DataType dt_input2 = input2->data_type();
    const DataType dt_output = output->data_type();
    const bool     is_sat    = (overflow_policy == ConvertPolicy::SATURATE);

    switch(dt_input1)
    {
        case DataType::QASYMM8:
            if(dt_input2 == DataType::QASYMM8 && dt_output == DataType::QASYMM8)
            {
                _func_quantized = &mul_saturate_quantized_8<uint8_t>;
            }
            break;
        case DataType::QASYMM8_SIGNED:
            if(dt_input2 == DataType::QASYMM8_SIGNED)
            {
                _func_quantized = &mul_saturate_quantized_8<int8_t>;
            }
            break;
        case DataType::QSYMM16:
            if(dt_input2 == DataType::QSYMM16 && dt_output == DataType::QSYMM16)
            {
                _func_quantized = &mul_saturate_QSYMM16_QSYMM16_QSYMM16;
            }
            else if(dt_input2 == DataType::QSYMM16 && dt_output == DataType::S32)
            {
                _func_int = &mul_QSYMM16_QSYMM16_S32;
            }
            break;
        case DataType::S16:
            if(DataType::U8 == dt_input2 && DataType::S16 == dt_output)
            {
                if(is_scale_255)
                {
                    _func_int = is_sat ? &mul_S16_U8_S16<true, true> : &mul_S16_U8_S16<true, false>;
                }
                else
                {
                    _func_int = is_sat ? &mul_S16_U8_S16<false, true> : &mul_S16_U8_S16<false, false>;
                }
            }
            if(DataType::S16 == dt_input2 && DataType::S16 == dt_output)
            {
                if(is_scale_255)
                {
                    _func_int = is_sat ? &mul_S16_S16_S16<true, true> : &mul_S16_S16_S16<true, false>;
                }
                else
                {
                    _func_int = is_sat ? &mul_S16_S16_S16<false, true> : &mul_S16_S16_S16<false, false>;
                }
            }
            break;
        case DataType::U8:
            if(DataType::U8 == dt_input2 && DataType::U8 == dt_output)
            {
                if(is_scale_255)
                {
                    _func_int = is_sat ? &mul_U8_U8_U8<true, true> : &mul_U8_U8_U8<true, false>;
                }
                else
                {
                    _func_int = is_sat ? &mul_U8_U8_U8<false, true> : &mul_U8_U8_U8<false, false>;
                }
            }
            else if(DataType::U8 == dt_input2 && DataType::S16 == dt_output)
            {
                if(is_scale_255)
                {
                    _func_int = is_sat ? &mul_U8_U8_S16<true, true> : &mul_U8_U8_S16<true, false>;
                }
                else
                {
                    _func_int = is_sat ? &mul_U8_U8_S16<false, true> : &mul_U8_U8_S16<false, false>;
                }
            }
            else if(DataType::S16 == dt_input2 && DataType::S16 == dt_output)
            {
                if(is_scale_255)
                {
                    _func_int = is_sat ? &mul_U8_S16_S16<true, true> : &mul_U8_S16_S16<true, false>;
                }
                else
                {
                    _func_int = is_sat ? &mul_U8_S16_S16<false, true> : &mul_U8_S16_S16<false, false>;
                }
            }
            break;
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
        case DataType::F16:
            _func_float = &mul_F16_F16_F16;
            break;
#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
        case DataType::F32:
            _func_float = &mul_F32_F32_F32;
            break;
        default:
            ARM_COMPUTE_ERROR("Unsupported combination of input and output data types");
    }

    // Configure kernel window
    output->set_valid_region(valid_region);
    Window win = calculate_max_window(valid_region, Steps());

    INEKernel::configure(win);
}

Status NEPixelWiseMultiplicationKernel::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, float scale, ConvertPolicy overflow_policy,
                                                 RoundingPolicy rounding_policy)
{
    ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input1, input2, output, scale, overflow_policy, rounding_policy));

    return Status{};
}

void NEPixelWiseMultiplicationKernel::run_op(const InputTensorMap &inputs, const OutputTensorMap &outputs, const Window &window, const ThreadInfo &info)
{
    ARM_COMPUTE_UNUSED(info);
    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);

    auto input1 = inputs.at(TensorType::ACL_SRC_0);
    auto input2 = inputs.at(TensorType::ACL_SRC_1);
    auto output = outputs.at(TensorType::ACL_DST);

    if(_func_quantized != nullptr)
    {
        (*_func_quantized)(input1, input2, output, window, _scale);
    }
    else if(_func_int != nullptr)
    {
        (*_func_int)(input1, input2, output, window, _scale_exponent);
    }
    else
    {
        ARM_COMPUTE_ERROR_ON(_func_float == nullptr);
        (*_func_float)(input1, input2, output, window, _scale);
    }
}
namespace
{
constexpr unsigned int num_elems_processed_per_iteration_complex = 2;
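// i.e. each iteration consumes two complex elements (two float channels each), one float32x4_t per
// call to c_mul_F32_F32_F32_n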

Status validate_arguments_complex(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output)
{
    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 2, DataType::F32);
    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 2, DataType::F32);

    const TensorShape &out_shape = TensorShape::broadcast_shape(input1->tensor_shape(), input2->tensor_shape());

    ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible");

    // Validate in case of configured output
    if(output->total_size() > 0)
    {
        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 2, DataType::F32);
        ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, output->tensor_shape(), 0), "Wrong shape for output");
    }

    return Status{};
}

std::pair<Status, Window> validate_and_configure_window_complex(ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output)
{
    const std::pair<TensorShape, ValidRegion> broadcast_pair = ITensorInfo::broadcast_shape_and_valid_region(*input1, *input2);
    const TensorShape &out_shape    = broadcast_pair.first;
    const ValidRegion &valid_region = broadcast_pair.second;

    // Auto initialize output if not initialized
    const TensorInfo out_info(out_shape, input1->num_channels(), input1->data_type());
    auto_init_if_empty(*output, out_info);

    Window win        = calculate_max_window(valid_region, Steps(num_elems_processed_per_iteration_complex));
    Window win_input1 = win.broadcast_if_dimension_le_one(*input1);
    Window win_input2 = win.broadcast_if_dimension_le_one(*input2);

    AccessWindowHorizontal input1_access(input1, 0, num_elems_processed_per_iteration_complex);
    AccessWindowHorizontal input2_access(input2, 0, num_elems_processed_per_iteration_complex);
    AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration_complex);

    bool window_changed = update_window_and_padding(win_input1, input1_access)
                          || update_window_and_padding(win_input2, input2_access)
                          || update_window_and_padding(win, output_access);

    output_access.set_valid_region(win, valid_region);

    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
    return std::make_pair(err, win);
}
} // namespace

void NEComplexPixelWiseMultiplicationKernel::configure(ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output)
{
    ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_complex(input1, input2, output));

    // Configure kernel window
    auto win_config = validate_and_configure_window_complex(input1, input2, output);
    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);

    // Create kernel
    INEKernel::configure(win_config.second);
}

Status NEComplexPixelWiseMultiplicationKernel::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output)
{
    ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_complex(input1, input2, output));
    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window_complex(input1->clone().get(), input2->clone().get(), output->clone().get()).first);

    return Status{};
}

void NEComplexPixelWiseMultiplicationKernel::run_op(const InputTensorMap &inputs, const OutputTensorMap &outputs, const Window &window, const ThreadInfo &info)
{
    ARM_COMPUTE_UNUSED(info);
    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);

    auto input1 = inputs.at(TensorType::ACL_SRC_0);
    auto input2 = inputs.at(TensorType::ACL_SRC_1);
    auto output = outputs.at(TensorType::ACL_DST);

    Iterator input1_it(input1, window.broadcast_if_dimension_le_one(input1->info()->tensor_shape()));
    Iterator input2_it(input2, window.broadcast_if_dimension_le_one(input2->info()->tensor_shape()));
    Iterator output_it(output, window);

    execute_window_loop(window, [&](const Coordinates &)
    {
        c_mul_F32_F32_F32_n(input1_it.ptr(), input2_it.ptr(), output_it.ptr());
    },
    input1_it, input2_it, output_it);
}
} // namespace arm_compute