/*
 * Copyright (c) 2020-2021 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

Dana Zlotnikbd2942d2021-11-15 08:46:04 +020025#include "src/cpu/kernels/add/generic/neon/impl.h"
26#include "arm_compute/core/Helpers.h"
Michalis Spyroua3c9a3b2020-12-08 21:02:16 +000027#include "arm_compute/core/utils/misc/Traits.h"
28#include "src/core/NEON/wrapper/wrapper.h"
Michalis Spyroua3c9a3b2020-12-08 21:02:16 +000029namespace arm_compute
30{
31namespace cpu
32{
Michalis Spyroua3c9a3b2020-12-08 21:02:16 +000033template <typename ScalarType>
Sheri Zhang61243902021-01-12 18:25:16 +000034void add_same_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
Michalis Spyroua3c9a3b2020-12-08 21:02:16 +000035{
Michele Di Giorgio33f41fa2021-03-09 14:09:08 +000036 /** SIMD vector tag type. */
Michalis Spyroua3c9a3b2020-12-08 21:02:16 +000037 using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t<ScalarType, wrapper::traits::BitWidth::W128>;
38
39 // Create input windows
Sheri Zhang61243902021-01-12 18:25:16 +000040 Window input1_win = window.broadcast_if_dimension_le_one(src0->info()->tensor_shape());
41 Window input2_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
Michalis Spyroua3c9a3b2020-12-08 21:02:16 +000042
43 // Clear X Dimension on execution window as we handle manually
44 Window win = window;
45 win.set(Window::DimX, Window::Dimension(0, 1, 1));
46
47 constexpr int window_step_x = 16 / sizeof(ScalarType);
48 const auto window_start_x = static_cast<int>(window.x().start());
49 const auto window_end_x = static_cast<int>(window.x().end());
Sheri Zhang61243902021-01-12 18:25:16 +000050 const bool is_broadcast_across_x = src0->info()->tensor_shape().x() != src1->info()->tensor_shape().x();
Michalis Spyroua3c9a3b2020-12-08 21:02:16 +000051
52 if(is_broadcast_across_x)
53 {
54 const bool is_broadcast_input_2 = input2_win.x().step() == 0;
55 Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win;
56 Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win;
Sheri Zhang61243902021-01-12 18:25:16 +000057 const ITensor *broadcast_tensor = is_broadcast_input_2 ? src1 : src0;
58 const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? src1 : src0;
Michalis Spyroua3c9a3b2020-12-08 21:02:16 +000059
60 // Clear X Dimension on execution window as we handle manually
61 non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));
62
63 Iterator broadcast_input(broadcast_tensor, broadcast_win);
64 Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
Sheri Zhang61243902021-01-12 18:25:16 +000065 Iterator output(dst, win);
Michalis Spyroua3c9a3b2020-12-08 21:02:16 +000066
67 execute_window_loop(win, [&](const Coordinates &)
68 {
69 const auto non_broadcast_input_ptr = reinterpret_cast<const ScalarType *>(non_broadcast_input.ptr());
70 const auto output_ptr = reinterpret_cast<ScalarType *>(output.ptr());
71
72 const ScalarType broadcast_value = *reinterpret_cast<const ScalarType *>(broadcast_input.ptr());
73 const auto broadcast_value_vec = wrapper::vdup_n(broadcast_value, ExactTagType{});
74
75 // Compute S elements per iteration
76 int x = window_start_x;
77 for(; x <= (window_end_x - window_step_x); x += window_step_x)
78 {
79 const auto non_broadcast_v = wrapper::vloadq(non_broadcast_input_ptr + x);
80 const auto res = (policy == ConvertPolicy::SATURATE) ? wrapper::vqadd(broadcast_value_vec, non_broadcast_v) : wrapper::vadd(broadcast_value_vec, non_broadcast_v);
81 wrapper::vstore(output_ptr + x, res);
82 }
83
84 // Compute left-over elements
85 for(; x < window_end_x; ++x)
86 {
87 const auto non_broadcast_v = *(non_broadcast_input_ptr + x);
88 *(output_ptr + x) = (policy == ConvertPolicy::SATURATE) ? wrapper::add_sat(broadcast_value, non_broadcast_v) : broadcast_value + non_broadcast_v;
89 }
90 },
91 broadcast_input, non_broadcast_input, output);
92 }
93 else
94 {
95 // Clear X Dimension on execution window as we handle manually
96 input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
97 input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
98
Sheri Zhang61243902021-01-12 18:25:16 +000099 Iterator input1(src0, input1_win);
100 Iterator input2(src1, input2_win);
101 Iterator output(dst, win);
Michalis Spyroua3c9a3b2020-12-08 21:02:16 +0000102
103 execute_window_loop(win, [&](const Coordinates &)
104 {
105 const auto input1_ptr = reinterpret_cast<const ScalarType *>(input1.ptr());
106 const auto input2_ptr = reinterpret_cast<const ScalarType *>(input2.ptr());
107 const auto output_ptr = reinterpret_cast<ScalarType *>(output.ptr());
108
109 // Compute S elements per iteration
110 int x = window_start_x;
111 for(; x <= (window_end_x - window_step_x); x += window_step_x)
112 {
113 const auto val1 = wrapper::vloadq(input1_ptr + x);
114 const auto val2 = wrapper::vloadq(input2_ptr + x);
115 const auto res = (policy == ConvertPolicy::SATURATE) ? wrapper::vqadd(val1, val2) : wrapper::vadd(val1, val2);
116 wrapper::vstore(output_ptr + x, res);
117 }
118
119 // Compute left-over elements
120 for(; x < window_end_x; ++x)
121 {
122 const auto val1 = *(input1_ptr + x);
123 const auto val2 = *(input2_ptr + x);
124 *(output_ptr + x) = (policy == ConvertPolicy::SATURATE) ? wrapper::add_sat(val1, val2) : val1 + val2;
125 }
126 },
127 input1, input2, output);
128 }
129}
Dana Zlotnikbd2942d2021-11-15 08:46:04 +0200130
131template void add_same_neon<float>(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window);
132template void add_same_neon<uint8_t>(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window);
133template void add_same_neon<int32_t>(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window);
134template void add_same_neon<int16_t>(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window);
135
136#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS)
137template void add_same_neon<float16_t>(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window);
138#endif /* (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */
139
Michalis Spyroua3c9a3b2020-12-08 21:02:16 +0000140} // namespace cpu
141} // namespace arm_compute