/*
 * Copyright (c) 2020 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#include "src/core/NEON/kernels/NELogicalKernel.h"

#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/Validate.h"
#include "src/core/common/Validate.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"

#include <algorithm> // std::min
#include <arm_neon.h>

namespace arm_compute
{
namespace kernels
{
namespace
{
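// All-zero and all-one constants in 64-bit (x8) and 128-bit (x16) variants.
// Operands are clamped against c1 with vmin so that any non-zero byte is
// treated as logical true before the bitwise AND/OR is applied.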
static const uint8x8_t  c0_x8     = vdup_n_u8(0);
static const uint8x16_t c0_x16    = vdupq_n_u8(0);
static const uint8x8_t  c1_x8     = vdup_n_u8(1);
static const uint8x16_t c1_x16    = vdupq_n_u8(1);
static const int        step      = 16;
static const int        half_step = step / 2;

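/** Elementwise logical AND of two U8 buffers.
 *
 * Processes 16 bytes per iteration while possible, then 8, then finishes
 * with a scalar tail loop.
 */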
void neon_logical_and(const uint8_t *src0, const uint8_t *src1, uint8_t *dst, int len)
{
    ARM_COMPUTE_ASSERT_NOT_NULLPTR(src0);
    ARM_COMPUTE_ASSERT_NOT_NULLPTR(src1);
    ARM_COMPUTE_ASSERT_NOT_NULLPTR(dst);
    ARM_COMPUTE_ASSERT(len >= 0);

    for(; len >= step; len -= step)
    {
        vst1q_u8(dst, vandq_u8(vminq_u8(vld1q_u8(src0), c1_x16), vminq_u8(vld1q_u8(src1), c1_x16)));
        src0 += step;
        src1 += step;
        dst += step;
    }

    for(; len >= half_step; len -= half_step)
    {
        vst1_u8(dst, vand_u8(vmin_u8(vld1_u8(src0), c1_x8), vmin_u8(vld1_u8(src1), c1_x8)));
        src0 += half_step;
        src1 += half_step;
        dst += half_step;
    }

    for(; len > 0; --len)
    {
        *dst = (*src0) && (*src1);
        ++src0;
        ++src1;
        ++dst;
    }
}

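/** Logical AND of a U8 buffer against a single broadcast value.
 *
 * The scalar is clamped to {0, 1} once and splatted across the vector lanes.
 */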
void neon_logical_and_broadcast(const uint8_t *src, uint8_t broadcast_val, uint8_t *dst, int len)
{
    ARM_COMPUTE_ASSERT_NOT_NULLPTR(src);
    ARM_COMPUTE_ASSERT_NOT_NULLPTR(dst);
    ARM_COMPUTE_ASSERT(len >= 0);

    const auto broadcast_val_clamped_s   = std::min<uint8_t>(broadcast_val, 1);
    const auto broadcast_val_clamped_x16 = vdupq_n_u8(broadcast_val_clamped_s);
    const auto broadcast_val_clamped_x8  = vdup_n_u8(broadcast_val_clamped_s);

    for(; len >= step; len -= step)
    {
        vst1q_u8(dst, vandq_u8(vminq_u8(vld1q_u8(src), c1_x16), broadcast_val_clamped_x16));
        src += step;
        dst += step;
    }

    for(; len >= half_step; len -= half_step)
    {
        vst1_u8(dst, vand_u8(vmin_u8(vld1_u8(src), c1_x8), broadcast_val_clamped_x8));
        src += half_step;
        dst += half_step;
    }

    for(; len > 0; --len)
    {
        *dst = (*src) && broadcast_val_clamped_s;
        ++src;
        ++dst;
    }
}

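/** Elementwise logical OR of two U8 buffers, using the same 16/8/1-byte
 * loop structure as neon_logical_and.
 */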
void neon_logical_or(const uint8_t *src0, const uint8_t *src1, uint8_t *dst, int len)
{
    ARM_COMPUTE_ASSERT_NOT_NULLPTR(src0);
    ARM_COMPUTE_ASSERT_NOT_NULLPTR(src1);
    ARM_COMPUTE_ASSERT_NOT_NULLPTR(dst);
    ARM_COMPUTE_ASSERT(len >= 0);

    for(; len >= step; len -= step)
    {
        vst1q_u8(dst, vorrq_u8(vminq_u8(vld1q_u8(src0), c1_x16), vminq_u8(vld1q_u8(src1), c1_x16)));
        src0 += step;
        src1 += step;
        dst += step;
    }

    for(; len >= half_step; len -= half_step)
    {
        vst1_u8(dst, vorr_u8(vmin_u8(vld1_u8(src0), c1_x8), vmin_u8(vld1_u8(src1), c1_x8)));
        src0 += half_step;
        src1 += half_step;
        dst += half_step;
    }

    for(; len > 0; --len)
    {
        *dst = (*src0) || (*src1);
        ++src0;
        ++src1;
        ++dst;
    }
}

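/** Logical OR of a U8 buffer against a single broadcast value. */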
void neon_logical_or_broadcast(const uint8_t *src, uint8_t broadcast_val, uint8_t *dst, int len)
{
    ARM_COMPUTE_ASSERT_NOT_NULLPTR(src);
    ARM_COMPUTE_ASSERT_NOT_NULLPTR(dst);
    ARM_COMPUTE_ASSERT(len >= 0);

    const auto broadcast_val_clamped_s   = std::min<uint8_t>(broadcast_val, 1);
    const auto broadcast_val_clamped_x16 = vdupq_n_u8(broadcast_val_clamped_s);
    const auto broadcast_val_clamped_x8  = vdup_n_u8(broadcast_val_clamped_s);

    for(; len >= step; len -= step)
    {
        vst1q_u8(dst, vorrq_u8(vminq_u8(vld1q_u8(src), c1_x16), broadcast_val_clamped_x16));
        src += step;
        dst += step;
    }

    for(; len >= half_step; len -= half_step)
    {
        vst1_u8(dst, vorr_u8(vmin_u8(vld1_u8(src), c1_x8), broadcast_val_clamped_x8));
        src += half_step;
        dst += half_step;
    }

    for(; len > 0; --len)
    {
        *dst = (*src) || broadcast_val_clamped_s;
        ++src;
        ++dst;
    }
}

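/** Elementwise logical NOT of a U8 buffer.
 *
 * vceq against zero yields an all-ones lane mask for false inputs; vbsl then
 * turns that mask into a 0/1 result.
 */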
void neon_logical_not(const uint8_t *src, uint8_t *dst, int len)
{
    ARM_COMPUTE_ASSERT_NOT_NULLPTR(src);
    ARM_COMPUTE_ASSERT_NOT_NULLPTR(dst);
    ARM_COMPUTE_ASSERT(len >= 0);

    for(; len >= step; len -= step)
    {
        vst1q_u8(dst, vbslq_u8(vceqq_u8(vld1q_u8(src), c0_x16), c1_x16, c0_x16));
        src += step;
        dst += step;
    }

    for(; len >= half_step; len -= half_step)
    {
        vst1_u8(dst, vbsl_u8(vceq_u8(vld1_u8(src), c0_x8), c1_x8, c0_x8));
        src += half_step;
        dst += half_step;
    }

    for(; len > 0; --len)
    {
        *dst = !(*src);
        ++src;
        ++dst;
    }
}

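// Runs NOT over the window. The X dimension is collapsed to a single step so
// that each micro-kernel call handles a whole row of `len` elements.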
void run_unary(const Window &window, const ITensor *src, ITensor *dst)
{
    Window win{ window };
    win.set(Window::DimX, Window::Dimension(0, 1, 1));
    const auto len = static_cast<int>(window.x().end()) - static_cast<int>(window.x().start());

    Iterator in(src, win);
    Iterator out(dst, win);

    execute_window_loop(win, [&](const Coordinates &)
    {
        neon_logical_not(in.ptr(), out.ptr(), len);
    },
    in, out);
}

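// Runs AND/OR over the window. When the two inputs differ in their X extent,
// the input whose X window step is 0 is the broadcast one, and the scalar
// broadcast micro-kernels are used instead.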
void run_binary(const Window &window, const ITensor *src0, const ITensor *src1, ITensor *dst, LogicalOperation op)
{
    Window src0_win = window.broadcast_if_dimension_le_one(src0->info()->tensor_shape());
    Window src1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());

    Window win{ window };
    win.set(Window::DimX, Window::Dimension(0, 1, 1));

    const bool is_broadcast_across_x = src0->info()->tensor_shape().x() != src1->info()->tensor_shape().x();
    const auto len                   = static_cast<int>(window.x().end()) - static_cast<int>(window.x().start());

    if(is_broadcast_across_x)
    {
        using LogicalBroadcastUKernelPtr = std::add_pointer<void(const uint8_t *, uint8_t, uint8_t *, int)>::type;
        LogicalBroadcastUKernelPtr logical_func = op == LogicalOperation::Or ? &neon_logical_or_broadcast : &neon_logical_and_broadcast;

        const bool     is_broadcast_input_1 = src1_win.x().step() == 0;
        Window         broadcast_win        = is_broadcast_input_1 ? src1_win : src0_win;
        Window         non_broadcast_win    = !is_broadcast_input_1 ? src1_win : src0_win;
        const ITensor *broadcast_tensor     = is_broadcast_input_1 ? src1 : src0;
        const ITensor *non_broadcast_tensor = !is_broadcast_input_1 ? src1 : src0;
        non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));

        Iterator broadcast_in(broadcast_tensor, broadcast_win);
        Iterator non_broadcast_in(non_broadcast_tensor, non_broadcast_win);
        Iterator out(dst, win);

        execute_window_loop(win, [&](const Coordinates &)
        {
            const uint8_t broadcast_value = *broadcast_in.ptr();
            logical_func(non_broadcast_in.ptr(), broadcast_value, out.ptr(), len);
        },
        broadcast_in, non_broadcast_in, out);
    }
    else
    {
        using LogicalUKernelPtr = std::add_pointer<void(const uint8_t *, const uint8_t *, uint8_t *, int)>::type;
        LogicalUKernelPtr logical_func = op == LogicalOperation::Or ? &neon_logical_or : &neon_logical_and;

        src0_win.set(Window::DimX, Window::Dimension(0, 1, 1));
        src1_win.set(Window::DimX, Window::Dimension(0, 1, 1));

        Iterator in0(src0, src0_win);
        Iterator in1(src1, src1_win);
        Iterator out(dst, win);
        execute_window_loop(win, [&](const Coordinates &)
        {
            logical_func(in0.ptr(), in1.ptr(), out.ptr(), len);
        },
        in0, in1, out);
    }
}
} // namespace

const char *NELogicalKernel::name() const
{
    return "NELogicalKernel";
}

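// Validates the inputs, computes the (possibly broadcast) output shape and
// kernel window, and auto-initializes the output info if it is still empty.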
void NELogicalKernel::configure(const ITensorInfo *input1, const ITensorInfo *input2, ITensorInfo *output, LogicalOperation op)
{
    ARM_COMPUTE_ERROR_ON_NULLPTR(input1, output);
    ARM_COMPUTE_ERROR_THROW_ON(validate(input1, input2, output, op));

    _op = op;

    Window      win       = calculate_max_window(*input1, Steps());
    TensorShape out_shape = input1->tensor_shape();
    if(op != LogicalOperation::Not)
    {
        ARM_COMPUTE_ERROR_ON_NULLPTR(input2);
        const std::pair<TensorShape, ValidRegion> broadcast_pair = ITensorInfo::broadcast_shape_and_valid_region(*input1, *input2);
        out_shape = broadcast_pair.first;
        win       = calculate_max_window(broadcast_pair.second, Steps());
    }
    ICPPKernel::configure(win);

    // Auto initialize if empty
    set_shape_if_empty(*output, out_shape);
    set_data_type_if_unknown(*output, input1->data_type());
}

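// Static validation: inputs must be U8 with a known operation, and binary
// operations additionally require broadcast-compatible input shapes.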
Status NELogicalKernel::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, LogicalOperation op)
{
    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8);
    ARM_COMPUTE_RETURN_ERROR_ON(op == LogicalOperation::Unknown);

    TensorShape out_shape = input1->tensor_shape();
    if(op != LogicalOperation::Not)
    {
        // Binary operations need a second input
        ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input2);
        out_shape = TensorShape::broadcast_shape(input1->tensor_shape(), input2->tensor_shape());
        ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible");
        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input1, input2);
    }

    // Checks performed when output is configured
    if((output != nullptr) && (output->total_size() != 0))
    {
        ARM_COMPUTE_RETURN_ERROR_ON(detail::have_different_dimensions(out_shape, output->tensor_shape(), 0));
        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input1, output);
    }

    return Status{};
}

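// Selects the unary (NOT) or binary (AND/OR) runner for the configured op.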
void NELogicalKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
{
    ARM_COMPUTE_UNUSED(info);
    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
    ARM_COMPUTE_ERROR_ON(tensors.empty());

    const ITensor *src0 = tensors.get_const_tensor(TensorType::ACL_SRC_0);
    const ITensor *src1 = tensors.get_const_tensor(TensorType::ACL_SRC_1);
    ITensor       *dst  = tensors.get_tensor(TensorType::ACL_DST);

    if(_op == LogicalOperation::Not)
    {
        run_unary(window, src0, dst);
    }
    else
    {
        run_binary(window, src0, src1, dst, _op);
    }
}
} // namespace kernels
} // namespace arm_compute