blob: 98b76c7db312d532d54802e26f5abbe86e129f18 [file] [log] [blame]
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001/*
Sheri Zhangac6499a2021-02-10 15:32:38 +00002 * Copyright (c) 2017-2021 Arm Limited.
Anthony Barbier6ff3b192017-09-04 18:44:23 +01003 *
4 * SPDX-License-Identifier: MIT
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to
8 * deal in the Software without restriction, including without limitation the
9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10 * sell copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in all
14 * copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
Michalis Spyrouebcebf12020-10-21 00:04:14 +010024#include "src/core/NEON/kernels/NEDirectConvolutionLayerKernel.h"
Georgios Pinitasddb93bb2020-10-02 16:38:59 +010025
26#include "src/core/NEON/kernels/detail/NEDirectConvolutionDetail.h"
27#include "src/core/NEON/wrapper/wrapper.h"
Anthony Barbier6ff3b192017-09-04 18:44:23 +010028
Anthony Barbier6ff3b192017-09-04 18:44:23 +010029#include "arm_compute/core/Error.h"
30#include "arm_compute/core/Helpers.h"
31#include "arm_compute/core/IAccessWindow.h"
32#include "arm_compute/core/ITensor.h"
Anthony Barbier6ff3b192017-09-04 18:44:23 +010033#include "arm_compute/core/Types.h"
Gian Marco Iodice5cb4d6a2017-08-08 10:53:00 +010034#include "arm_compute/core/Utils.h"
Anthony Barbier6ff3b192017-09-04 18:44:23 +010035#include "arm_compute/core/Validate.h"
Giorgio Arenac0f54432018-03-16 14:02:34 +000036#include "arm_compute/core/utils/misc/ShapeCalculator.h"
Sang-Hoon Park68dd25f2020-10-19 16:00:11 +010037#include "src/core/AccessWindowStatic.h"
38#include "src/core/CPP/Validate.h"
Georgios Pinitasddb93bb2020-10-02 16:38:59 +010039#include "src/core/NEON/NEFixedPoint.h"
Sang-Hoon Park68dd25f2020-10-19 16:00:11 +010040#include "src/core/helpers/AutoConfiguration.h"
41#include "src/core/helpers/WindowHelpers.h"
Anthony Barbier6ff3b192017-09-04 18:44:23 +010042
43#include <algorithm>
Anthony Barbier6ff3b192017-09-04 18:44:23 +010044
Michalis Spyrou7362f0d2017-10-18 17:58:22 +010045using namespace arm_compute::detail;
Anthony Barbier6ff3b192017-09-04 18:44:23 +010046
Manuel Bottini87350f42020-09-15 13:03:34 +010047namespace arm_compute
48{
Anthony Barbier6ff3b192017-09-04 18:44:23 +010049namespace
50{
Ioan-Cristian Szabo5edbd1c2017-11-13 13:34:08 +000051#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
Pablo Tello0d176142017-07-06 16:43:14 +010052template <unsigned int stridex>
53float16x8_t internal_vld1q(const float16_t *in);
54
55template <>
56float16x8_t internal_vld1q<1>(const float16_t *in)
57{
58 return vld1q_f16(in);
59}
60
61template <>
62float16x8_t internal_vld1q<2>(const float16_t *in)
63{
64 const float16x8x2_t tmp = vld2q_f16(in);
65 return tmp.val[0];
66}
67
68template <>
69float16x8_t internal_vld1q<3>(const float16_t *in)
70{
71 const float16x8x3_t tmp = vld3q_f16(in);
72 return tmp.val[0];
73}
74
75inline float16x8_t internal_vdupq_n(float16_t v)
76{
77 return vdupq_n_f16(v);
78}
79
80inline void internal_vst1q(float16_t *p, const float16x8_t &v)
81{
82 vst1q_f16(p, v);
83}
84
Vidhya Sudhan Loganathan7485d5a2018-07-04 09:34:00 +010085float16x8_t internal_vmull(const float16x8_t &x, const float16x8_t &y)
Pablo Tello0d176142017-07-06 16:43:14 +010086{
Pablo Tello0d176142017-07-06 16:43:14 +010087 return vmulq_f16(x, y);
88}
89
Vidhya Sudhan Loganathan7485d5a2018-07-04 09:34:00 +010090inline float16x8_t internal_vmlal(const float16x8_t &x, const float16x8_t &y, const float16x8_t &z)
Pablo Tello0d176142017-07-06 16:43:14 +010091{
Pablo Tello0d176142017-07-06 16:43:14 +010092 return vaddq_f16(x, vmulq_f16(y, z));
93}
Ioan-Cristian Szabo5edbd1c2017-11-13 13:34:08 +000094#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
Pablo Tello0d176142017-07-06 16:43:14 +010095
Anthony Barbier6ff3b192017-09-04 18:44:23 +010096template <unsigned int stridex>
97float32x4_t internal_vld1q(const float *in);
98
99template <>
100float32x4_t internal_vld1q<1>(const float *in)
101{
102 return vld1q_f32(in);
103}
104
105template <>
106float32x4_t internal_vld1q<2>(const float *in)
107{
108 const float32x4x2_t tmp = vld2q_f32(in);
109 return tmp.val[0];
110}
111
112template <>
113float32x4_t internal_vld1q<3>(const float *in)
114{
115 const float32x4x3_t tmp = vld3q_f32(in);
116 return tmp.val[0];
117}
118
Pablo Tellof87cc7f2017-07-26 10:28:40 +0100119inline float32x4_t internal_vdupq_n(float v)
120{
121 return vdupq_n_f32(v);
122}
123
124inline void internal_vst1q(float *p, const float32x4_t &v)
125{
126 vst1q_f32(p, v);
127}
128
Vidhya Sudhan Loganathan7485d5a2018-07-04 09:34:00 +0100129float32x4_t internal_vmull(const float32x4_t &x, const float32x4_t &y)
Pablo Tellof87cc7f2017-07-26 10:28:40 +0100130{
Pablo Tellof87cc7f2017-07-26 10:28:40 +0100131 return vmulq_f32(x, y);
132}
133
Vidhya Sudhan Loganathan7485d5a2018-07-04 09:34:00 +0100134inline float32x4_t internal_vmlal(const float32x4_t &x, const float32x4_t &y, const float32x4_t &z)
Pablo Tellof87cc7f2017-07-26 10:28:40 +0100135{
Pablo Tellof87cc7f2017-07-26 10:28:40 +0100136 return vmlaq_f32(x, y, z);
137}
138
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000139constexpr int small_tensor_size_optim = 8;
140inline bool run_optim_small_tensor_info(const ITensorInfo *t)
141{
142 return t->dimension(Window::DimX) <= small_tensor_size_optim && t->dimension(Window::DimY) <= small_tensor_size_optim;
143}
144
Pablo Telloc09314a2017-09-21 13:59:14 +0100145inline bool run_optim_small_tensor(const ITensor *t)
146{
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000147 return run_optim_small_tensor_info(t->info());
Pablo Telloc09314a2017-09-21 13:59:14 +0100148}
149
150// Optimized convolver for 1x1 kernels used only where input width and height are both <= 8
151// For big Z as in Input=7x7x832, this implementation is faster than the general code becuase it doesn't need to
Michele Di Giorgio33f41fa2021-03-09 14:09:08 +0000152// store intermidiate results in memory. Temporary results are stored in SIMD registers directly and then written to the output buffer.
Pablo Telloc09314a2017-09-21 13:59:14 +0100153template <unsigned int stridex>
154class convolver_w1x1_i8x8_f32
155{
156public:
157 static void convolve(const Window &window, const ITensor *input, const ITensor *weights, ITensor *output, const PadStrideInfo &conv_info)
158 {
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000159 ARM_COMPUTE_ERROR_ON(input->info()->dimension(Window::DimX) > small_tensor_size_optim);
160 ARM_COMPUTE_ERROR_ON(input->info()->dimension(Window::DimY) > small_tensor_size_optim);
Pablo Telloc09314a2017-09-21 13:59:14 +0100161
Georgios Pinitas15997872018-02-19 13:58:22 +0000162 const int input_stride_x = input->info()->strides_in_bytes().x();
Pablo Telloc09314a2017-09-21 13:59:14 +0100163 const int input_stride_y = input->info()->strides_in_bytes().y();
164 const int input_stride_z = input->info()->strides_in_bytes().z();
165 const int output_stride_y = output->info()->strides_in_bytes().y();
166 const int output_stride_z = output->info()->strides_in_bytes().z();
167 const int kernel_stride_z = weights->info()->strides_in_bytes().z();
168 const int kernel_stride_w = weights->info()->strides_in_bytes()[3];
169 const int output_h = output->info()->dimension(1);
170 const int range_z = window.z().end() - window.z().start();
171 const int kernel_depth = weights->info()->dimension(Window::DimZ);
172 const unsigned int conv_stride_y = std::get<1>(conv_info.stride());
Georgios Pinitas15997872018-02-19 13:58:22 +0000173 const unsigned int conv_pad_left = conv_info.pad_left();
174 const unsigned int conv_pad_top = conv_info.pad_top();
Pablo Telloc09314a2017-09-21 13:59:14 +0100175
176 // setup output window for the iterator
177 Window window_out = window;
178 window_out.set(Window::DimX, Window::Dimension(0, output->info()->dimension(Window::DimX), output->info()->dimension(Window::DimX)));
179 window_out.set(Window::DimY, Window::Dimension(0, output->info()->dimension(Window::DimY), output->info()->dimension(Window::DimY)));
180 window_out.set(Window::DimZ, Window::Dimension(window.z().start(), window.z().end(), range_z));
181
182 // setup input window for the iterator
183 Window window_in = window;
184 // we just want execute_window_loop to iterate over the higher dimensions (>3), so we set the first 3 dimensions to 0
185 window_in.set(Window::DimX, Window::Dimension(0, 0, 0));
186 window_in.set(Window::DimY, Window::Dimension(0, 0, 0));
187 window_in.set(Window::DimZ, Window::Dimension(0, 0, 0));
188
189 Window window_k = calculate_max_window(*weights->info(), Steps(1u));
190 Iterator out(output, window_out);
191 Iterator in(input, window_in);
192 Iterator k(weights, window_k);
193
194 const uint8_t *k_ptr = k.ptr();
195
196 execute_window_loop(window_out, [&](const Coordinates & id)
197 {
Michalis Spyroubcfd09a2019-05-01 13:03:59 +0100198 const uint8_t *input_ptr = in.ptr() - conv_pad_left * input_stride_x - conv_pad_top * input_stride_y;
199 uint8_t *out_ptr = out.ptr();
200 int ih = 0;
201 int oh = 0;
202 std::array<float32x4_t, 8> accum0 = { vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0) };
203 std::array<float32x4_t, 8> accum1 = { vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0) };
Pablo Telloc09314a2017-09-21 13:59:14 +0100204 for(int oz = 0; oz < range_z; ++oz)
205 {
206 accum0[0] = accum0[1] = accum0[2] = accum0[3] = accum0[4] = accum0[5] = accum0[6] = accum0[7] = vdupq_n_f32(0.f);
207 accum1[0] = accum1[1] = accum1[2] = accum1[3] = accum1[4] = accum1[5] = accum1[6] = accum1[7] = vdupq_n_f32(0.f);
208 auto p_out_base = out_ptr + oz * output_stride_z;
209 for(int p = 0; p < kernel_depth; ++p)
210 {
211 const auto k_val = reinterpret_cast<const float *>(k_ptr + p * kernel_stride_z + (id.z() + oz) * kernel_stride_w);
212 const auto vk0 = internal_vdupq_n(*k_val);
213 for(ih = 0, oh = 0; oh < output_h; ++oh, ih += conv_stride_y)
214 {
215 const int offset_xy = ih * input_stride_y;
216 auto in_val = reinterpret_cast<const float *>(input_ptr + p * input_stride_z + offset_xy);
217 auto v_in0 = internal_vld1q<stridex>(in_val);
218 auto v_in1 = internal_vld1q<stridex>(in_val + 4);
219 accum0[oh] = vmlaq_f32(accum0[oh], vk0, v_in0);
220 accum1[oh] = vmlaq_f32(accum1[oh], vk0, v_in1);
221 }
222 }
223 for(oh = 0; oh < output_h; ++oh)
224 {
225 auto p_out = reinterpret_cast<float *>(p_out_base + oh * output_stride_y);
226 vst1q_f32(p_out, accum0[oh]);
227 vst1q_f32(p_out + 4, accum1[oh]);
228 }
229 }
230 },
231 in, out);
232 }
233};
234
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100235template <typename T1, typename T2, unsigned int stridex>
236class convolver_1x1
237{
238public:
239 static void convolve(const Window &window, unsigned int num_elems_read_per_iteration, unsigned int num_elems_written_per_iteration,
240 const ITensor *input, const ITensor *weights, ITensor *output, const PadStrideInfo &conv_info)
241 {
Vidhya Sudhan Loganathan7485d5a2018-07-04 09:34:00 +0100242 const int input_stride_x = input->info()->strides_in_bytes().x();
243 const int input_stride_y = input->info()->strides_in_bytes().y();
244 const int input_stride_z = input->info()->strides_in_bytes().z();
245 const int output_stride_y = output->info()->strides_in_bytes().y();
246 const int output_stride_z = output->info()->strides_in_bytes().z();
247 const int kernel_stride_z = weights->info()->strides_in_bytes().z();
248 const int kernel_stride_w = weights->info()->strides_in_bytes()[3];
249 const int output_w = output->info()->dimension(0);
250 const int output_h = output->info()->dimension(1);
251 const int range_z = window.z().end() - window.z().start();
252 const int kernel_depth = weights->info()->dimension(Window::DimZ);
253 const unsigned int conv_stride_y = std::get<1>(conv_info.stride());
254 const unsigned int conv_pad_left = conv_info.pad_left();
255 const unsigned int conv_pad_top = conv_info.pad_top();
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100256
257 // setup output window for the iterator
258 Window window_out = window;
259 window_out.set(Window::DimX, Window::Dimension(0, output->info()->dimension(Window::DimX), output->info()->dimension(Window::DimX)));
260 window_out.set(Window::DimY, Window::Dimension(0, output->info()->dimension(Window::DimY), output->info()->dimension(Window::DimY)));
261 window_out.set(Window::DimZ, Window::Dimension(window.z().start(), window.z().end(), range_z));
262
263 // setup input window for the iterator
264 Window window_in = window;
265 // we just want execute_window_loop to iterate over the higher dimensions (>3), so we set the first 3 dimensions to 0
266 window_in.set(Window::DimX, Window::Dimension(0, 0, 0));
267 window_in.set(Window::DimY, Window::Dimension(0, 0, 0));
268 window_in.set(Window::DimZ, Window::Dimension(0, 0, 0));
269
Pablo Tellof87cc7f2017-07-26 10:28:40 +0100270 Window window_k = calculate_max_window(*weights->info(), Steps(1u));
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100271 Iterator out(output, window_out);
272 Iterator in(input, window_in);
273 Iterator k(weights, window_k);
274
275 const uint8_t *k_ptr = k.ptr();
276
277 execute_window_loop(window_out, [&](const Coordinates & id)
278 {
279 /*
280 For a detailed explanation on how the algorithm works refer to template <> class convolver_3x3<1>
281 */
Georgios Pinitas15997872018-02-19 13:58:22 +0000282 const uint8_t *input_ptr = in.ptr() - conv_pad_left * input_stride_x - conv_pad_top * input_stride_y;
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100283 uint8_t *out_ptr = out.ptr();
284 int ih = 0;
285 int oh = 0;
286 for(int oz = 0; oz < range_z; ++oz)
287 {
288 auto p_out_base = out_ptr + oz * output_stride_z;
289 // Step 1
290 {
291 const auto k_val = reinterpret_cast<const T1 *>(k_ptr + 0 * kernel_stride_z + (id.z() + oz) * kernel_stride_w);
292 const auto vk = internal_vdupq_n(*k_val);
293 for(ih = 0, oh = 0; oh < output_h; ++oh, ih += conv_stride_y)
294 {
295 const int offset_xy = ih * input_stride_y;
296 auto in_val = reinterpret_cast<const T1 *>(input_ptr + (0 * input_stride_z + offset_xy));
297 auto p_out = reinterpret_cast<T2 *>(p_out_base + oh * output_stride_y);
298 for(int ow = 0; ow < output_w; ow += num_elems_written_per_iteration, in_val += num_elems_read_per_iteration, p_out += num_elems_written_per_iteration)
299 {
Vidhya Sudhan Loganathan7485d5a2018-07-04 09:34:00 +0100300 internal_vst1q(p_out, internal_vmull(vk, internal_vld1q<stridex>(in_val)));
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100301 }
302 }
303 }
Pablo Telloc09314a2017-09-21 13:59:14 +0100304
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100305 // Step 2
306 for(int p = 1; p < kernel_depth; ++p)
307 {
308 const auto k_val = reinterpret_cast<const T1 *>(k_ptr + p * kernel_stride_z + (id.z() + oz) * kernel_stride_w);
309 const auto vk = internal_vdupq_n(*k_val);
310 for(ih = 0, oh = 0; oh < output_h; ++oh, ih += conv_stride_y)
311 {
312 const int offset_xy = ih * input_stride_y;
313 auto in_val = reinterpret_cast<const T1 *>(input_ptr + p * input_stride_z + offset_xy);
314 auto p_out = reinterpret_cast<T2 *>(p_out_base + oh * output_stride_y);
315 for(int ow = 0; ow < output_w; ow += num_elems_written_per_iteration, in_val += num_elems_read_per_iteration, p_out += num_elems_written_per_iteration)
316 {
Vidhya Sudhan Loganathan7485d5a2018-07-04 09:34:00 +0100317 internal_vst1q(p_out, internal_vmlal(internal_vld1q<1>(p_out), vk, internal_vld1q<stridex>(in_val)));
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100318 }
319 }
320 }
321 }
322 },
323 in, out);
324 }
325};
326
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100327template <unsigned int stridex>
Pablo Tello06da39d2017-08-10 15:10:40 +0100328float32x4x2_t convolve_5x5(const float *in_0, const float *in_1, const float *in_2, const float *in_3, const float *in_4,
Vidhya Sudhan Loganathan7485d5a2018-07-04 09:34:00 +0100329 const float *m0, const float *m1, const float *m2, const float *m3, const float *m4);
Pablo Tello06da39d2017-08-10 15:10:40 +0100330
331inline float32x4x3_t load_matrix_hi(const float *const m0, const float *const m1, const float *const m2)
332{
333 const float32x4x3_t m00 =
334 {
335 {
336 vld1q_dup_f32(m0),
337 vld1q_dup_f32(m1),
338 vld1q_dup_f32(m2)
339 }
340 };
341 return m00;
342}
343
344inline float32x4x2_t load_matrix_lo(const float *const m3, const float *const m4)
345{
346 const float32x4x2_t m00 =
347 {
348 {
349 vld1q_dup_f32(m3),
350 vld1q_dup_f32(m4)
351 }
352 };
353 return m00;
354}
355
356inline float32x4x3_t load_input(const float *const in)
357{
358 const float32x4x3_t vin =
359 {
360 {
361 vld1q_f32(in),
362 vld1q_f32(in + 4),
363 vld1q_f32(in + 8)
364 }
365 };
366 return vin;
367}
368
369template <>
370inline float32x4x2_t convolve_5x5<1>(const float *in_0, const float *in_1, const float *in_2, const float *in_3, const float *in_4,
Vidhya Sudhan Loganathan7485d5a2018-07-04 09:34:00 +0100371 const float *m0, const float *m1, const float *m2, const float *m3, const float *m4)
Pablo Tello06da39d2017-08-10 15:10:40 +0100372{
Pablo Tello06da39d2017-08-10 15:10:40 +0100373 const float32x4x3_t vin0 = load_input(in_0);
374 const float32x4x3_t vin1 = load_input(in_1);
375 const float32x4x3_t vin2 = load_input(in_2);
376 const float32x4x3_t vin3 = load_input(in_3);
377 const float32x4x3_t vin4 = load_input(in_4);
378 const float32x4x3_t m00 = load_matrix_hi(m0, 1 + m0, 2 + m0);
379 const float32x4x2_t m01 = load_matrix_lo(3 + m0, 4 + m0);
380 const float32x4x3_t m10 = load_matrix_hi(m1, 1 + m1, 2 + m1);
381 const float32x4x2_t m11 = load_matrix_lo(3 + m1, 4 + m1);
382 const float32x4x3_t m20 = load_matrix_hi(m2, 1 + m2, 2 + m2);
383 const float32x4x2_t m21 = load_matrix_lo(3 + m2, 4 + m2);
384 const float32x4x3_t m30 = load_matrix_hi(m3, 1 + m3, 2 + m3);
385 const float32x4x2_t m31 = load_matrix_lo(3 + m3, 4 + m3);
386 const float32x4x3_t m40 = load_matrix_hi(m4, 1 + m4, 2 + m4);
387 const float32x4x2_t m41 = load_matrix_lo(3 + m4, 4 + m4);
388
389 float32x4x2_t out =
390 {
391 {
392 vmulq_f32(vin0.val[0], m00.val[0]),
393 vmulq_f32(vin0.val[1], m00.val[0])
394 }
395 };
396
397 out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vin0.val[0], vin0.val[1], 1), m00.val[1]);
398 out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vin0.val[0], vin0.val[1], 2), m00.val[2]);
399 out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vin0.val[0], vin0.val[1], 3), m01.val[0]);
400 out.val[0] = vmlaq_f32(out.val[0], vin0.val[1], m01.val[1]);
401
402 out.val[0] = vmlaq_f32(out.val[0], vin1.val[0], m10.val[0]);
403 out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vin1.val[0], vin1.val[1], 1), m10.val[1]);
404 out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vin1.val[0], vin1.val[1], 2), m10.val[2]);
405 out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vin1.val[0], vin1.val[1], 3), m11.val[0]);
406 out.val[0] = vmlaq_f32(out.val[0], vin1.val[1], m11.val[1]);
407
408 out.val[0] = vmlaq_f32(out.val[0], vin2.val[0], m20.val[0]);
409 out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vin2.val[0], vin2.val[1], 1), m20.val[1]);
410 out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vin2.val[0], vin2.val[1], 2), m20.val[2]);
411 out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vin2.val[0], vin2.val[1], 3), m21.val[0]);
412 out.val[0] = vmlaq_f32(out.val[0], vin2.val[1], m21.val[1]);
413
414 out.val[0] = vmlaq_f32(out.val[0], vin3.val[0], m30.val[0]);
415 out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vin3.val[0], vin3.val[1], 1), m30.val[1]);
416 out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vin3.val[0], vin3.val[1], 2), m30.val[2]);
417 out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vin3.val[0], vin3.val[1], 3), m31.val[0]);
418 out.val[0] = vmlaq_f32(out.val[0], vin3.val[1], m31.val[1]);
419
420 out.val[0] = vmlaq_f32(out.val[0], vin4.val[0], m40.val[0]);
421 out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vin4.val[0], vin4.val[1], 1), m40.val[1]);
422 out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vin4.val[0], vin4.val[1], 2), m40.val[2]);
423 out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vin4.val[0], vin4.val[1], 3), m41.val[0]);
424 out.val[0] = vmlaq_f32(out.val[0], vin4.val[1], m41.val[1]);
425
426 out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vin0.val[1], vin0.val[2], 1), m00.val[1]);
427 out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vin0.val[1], vin0.val[2], 2), m00.val[2]);
428 out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vin0.val[1], vin0.val[2], 3), m01.val[0]);
429 out.val[1] = vmlaq_f32(out.val[1], vin0.val[2], m01.val[1]);
430
431 out.val[1] = vmlaq_f32(out.val[1], vin1.val[1], m10.val[0]);
432 out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vin1.val[1], vin1.val[2], 1), m10.val[1]);
433 out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vin1.val[1], vin1.val[2], 2), m10.val[2]);
434 out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vin1.val[1], vin1.val[2], 3), m11.val[0]);
435 out.val[1] = vmlaq_f32(out.val[1], vin1.val[2], m11.val[1]);
436
437 out.val[1] = vmlaq_f32(out.val[1], vin2.val[1], m20.val[0]);
438 out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vin2.val[1], vin2.val[2], 1), m20.val[1]);
439 out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vin2.val[1], vin2.val[2], 2), m20.val[2]);
440 out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vin2.val[1], vin2.val[2], 3), m21.val[0]);
441 out.val[1] = vmlaq_f32(out.val[1], vin2.val[2], m21.val[1]);
442
443 out.val[1] = vmlaq_f32(out.val[1], vin3.val[1], m30.val[0]);
444 out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vin3.val[1], vin3.val[2], 1), m30.val[1]);
445 out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vin3.val[1], vin3.val[2], 2), m30.val[2]);
446 out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vin3.val[1], vin3.val[2], 3), m31.val[0]);
447 out.val[1] = vmlaq_f32(out.val[1], vin3.val[2], m31.val[1]);
448
449 out.val[1] = vmlaq_f32(out.val[1], vin4.val[1], m40.val[0]);
450 out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vin4.val[1], vin4.val[2], 1), m40.val[1]);
451 out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vin4.val[1], vin4.val[2], 2), m40.val[2]);
452 out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vin4.val[1], vin4.val[2], 3), m41.val[0]);
453 out.val[1] = vmlaq_f32(out.val[1], vin4.val[2], m41.val[1]);
454
455 return out;
456}
457
458template <>
459inline float32x4x2_t convolve_5x5<2>(const float *in_0, const float *in_1, const float *in_2, const float *in_3, const float *in_4,
Vidhya Sudhan Loganathan7485d5a2018-07-04 09:34:00 +0100460 const float *m0, const float *m1, const float *m2, const float *m3, const float *m4)
Pablo Tello06da39d2017-08-10 15:10:40 +0100461{
Vidhya Sudhan Loganathan7485d5a2018-07-04 09:34:00 +0100462 float32x4x2_t out = convolve_5x5<1>(in_0, in_1, in_2, in_3, in_4, m0, m1, m2, m3, m4);
Pablo Tello06da39d2017-08-10 15:10:40 +0100463 out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[0], 2), out.val[0], 1);
464 out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[1], 0), out.val[0], 2);
465 out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[1], 2), out.val[0], 3);
466 return out;
467}
468
469template <>
470inline float32x4x2_t convolve_5x5<3>(const float *in_0, const float *in_1, const float *in_2, const float *in_3, const float *in_4,
Vidhya Sudhan Loganathan7485d5a2018-07-04 09:34:00 +0100471 const float *m0, const float *m1, const float *m2, const float *m3, const float *m4)
Pablo Tello06da39d2017-08-10 15:10:40 +0100472{
Vidhya Sudhan Loganathan7485d5a2018-07-04 09:34:00 +0100473 float32x4x2_t out = convolve_5x5<1>(in_0, in_1, in_2, in_3, in_4, m0, m1, m2, m3, m4);
Pablo Tello06da39d2017-08-10 15:10:40 +0100474 out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[0], 3), out.val[0], 1);
475 return out;
476}
477
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100478template <typename T1, typename T2, unsigned int stridex>
479class convolver_3x3
480{
481public:
482 static void convolve(const Window &window, unsigned int num_elems_read_per_iteration, unsigned int num_elems_written_per_iteration,
483 const ITensor *input, const ITensor *weights, ITensor *output, const PadStrideInfo &conv_info)
484 {
485 ARM_COMPUTE_UNUSED(num_elems_read_per_iteration);
Vidhya Sudhan Loganathan7485d5a2018-07-04 09:34:00 +0100486 const int input_stride_x = input->info()->strides_in_bytes().x();
487 const int input_stride_y = input->info()->strides_in_bytes().y();
488 const int input_stride_z = input->info()->strides_in_bytes().z();
489 const int output_stride_y = output->info()->strides_in_bytes().y();
490 const int output_stride_z = output->info()->strides_in_bytes().z();
491 const int kernel_stride_x = weights->info()->strides_in_bytes().x();
492 const int kernel_stride_y = weights->info()->strides_in_bytes().y();
493 const int kernel_stride_z = weights->info()->strides_in_bytes().z();
494 const int kernel_stride_w = weights->info()->strides_in_bytes()[3];
495 const int output_w = output->info()->dimension(0);
496 const int output_h = output->info()->dimension(1);
497 const int num_planes_z = window.z().end() - window.z().start();
Michele Di Giorgio13ec5f02020-01-02 12:11:13 +0000498 const int delta_input = get_input_num_elems_processed(num_elems_written_per_iteration, stridex);
Vidhya Sudhan Loganathan7485d5a2018-07-04 09:34:00 +0100499 const int kernel_depth = weights->info()->dimension(Window::DimZ);
500 const unsigned int conv_stride_y = std::get<1>(conv_info.stride());
501 const unsigned int conv_pad_left = conv_info.pad_left();
502 const unsigned int conv_pad_top = conv_info.pad_top();
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100503
504 // setup output window for the iterator
505 Window window_out = window;
506 window_out.set(Window::DimX, Window::Dimension(0, output->info()->dimension(Window::DimX), output->info()->dimension(Window::DimX)));
507 window_out.set(Window::DimY, Window::Dimension(0, output->info()->dimension(Window::DimY), output->info()->dimension(Window::DimY)));
508 window_out.set(Window::DimZ, Window::Dimension(window.z().start(), window.z().end(), num_planes_z));
509
510 // setup input window for the iterator
511 Window window_in = window;
512 // we just want execute_window_loop to iterate over the higher dimensions (>3), so we set the first 3 dimensions to 0
513 window_in.set(Window::DimX, Window::Dimension(0, 0, 0));
514 window_in.set(Window::DimY, Window::Dimension(0, 0, 0));
515 window_in.set(Window::DimZ, Window::Dimension(0, 0, 0));
516
517 Window window_k = calculate_max_window(*weights->info(), Steps(1u));
518
519 Iterator out(output, window_out);
520 Iterator in(input, window_in);
521 Iterator k(weights, window_k);
522
523 const uint8_t *k_ptr = k.ptr();
524
525 execute_window_loop(window_out, [&](const Coordinates & id)
526 {
Georgios Pinitas15997872018-02-19 13:58:22 +0000527 const uint8_t *input_ptr = in.ptr() - conv_pad_left * input_stride_x - conv_pad_top * input_stride_y;
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100528 uint8_t *out_ptr = out.ptr();
529 int ih = 0;
530 int oh = 0;
531 /*
532 Each thread executing this kernel computes one or more output's volume planes.
533
534 Let's say the 3rd dimension of the output volume is 32, the first thread will compute the output for Z = [0,7], the second thread will compute the output for Z = [8,15],
535 the third thread [16,24] and the fourth thread [25,31].
536
537 The algorithm outer loop iterates over Z, P, Y, X where P is the depth/3rd dimension of each kernel. This order is not arbitrary, the main benefit of this
Anthony Barbiere5007472017-10-27 15:01:44 +0100538 is that we setup the neon registers containing the kernel's values only once and then compute each XY using the preloaded registers as opposed as doing this for every XY value.
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100539
540 The algorithm does not require allocating any additional memory amd computes the results directly in-place in two stages:
541 1) Convolve plane 0 with kernel 0 and initialize the corresponding output plane with these values.
542 2) Convolve the remaining planes and accumulate the results in the output's plane which has been initialized in step 1.
543 */
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100544 for(int oz = 0; oz < num_planes_z; ++oz)
545 {
Pablo Tello0d176142017-07-06 16:43:14 +0100546 const int zoffset = id.z() + oz;
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100547 uint8_t *p_out_base = out_ptr + oz * output_stride_z;
548 // Step 1
549 {
Pablo Tello0d176142017-07-06 16:43:14 +0100550 const auto ptr_k_r0 = reinterpret_cast<const T1 *>(k_ptr + 0 * kernel_stride_z + zoffset * kernel_stride_w + 0 * kernel_stride_y + 0 * kernel_stride_x);
551 const auto ptr_k_r1 = reinterpret_cast<const T1 *>(k_ptr + 0 * kernel_stride_z + zoffset * kernel_stride_w + 1 * kernel_stride_y + 0 * kernel_stride_x);
552 const auto ptr_k_r2 = reinterpret_cast<const T1 *>(k_ptr + 0 * kernel_stride_z + zoffset * kernel_stride_w + 2 * kernel_stride_y + 0 * kernel_stride_x);
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100553 const auto vk_r0 = load_matrix_row(ptr_k_r0);
554 const auto vk_r1 = load_matrix_row(ptr_k_r1);
555 const auto vk_r2 = load_matrix_row(ptr_k_r2);
556 for(ih = 0, oh = 0; oh < output_h; ++oh, ih += conv_stride_y)
557 {
558 auto in_top = reinterpret_cast<const T1 *>(input_ptr + 0 * input_stride_z + (ih + 0) * input_stride_y);
559 auto in_mid = reinterpret_cast<const T1 *>(input_ptr + 0 * input_stride_z + (ih + 1) * input_stride_y);
560 auto in_low = reinterpret_cast<const T1 *>(input_ptr + 0 * input_stride_z + (ih + 2) * input_stride_y);
561 auto p_out = reinterpret_cast<T2 *>(p_out_base + oh * output_stride_y);
562 for(int ow = 0; ow < output_w; ow += num_elems_written_per_iteration,
563 in_top += delta_input, in_mid += delta_input, in_low += delta_input, p_out += num_elems_written_per_iteration)
564 {
Georgios Pinitasa26e1662020-03-04 15:31:25 +0000565 convolve_3x3<false>(in_top, in_mid, in_low, p_out, vk_r0, vk_r1, vk_r2, stridex);
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100566 }
567 }
568 }
569 // Step 2
570 for(int p = 1; p < kernel_depth; ++p)
571 {
Pablo Tello06da39d2017-08-10 15:10:40 +0100572 const uint8_t *ptr_k_base = k_ptr + p * kernel_stride_z + zoffset * kernel_stride_w;
573 const uint8_t *input_base = input_ptr + p * input_stride_z;
574 const auto ptr_k_r0 = reinterpret_cast<const T1 *>(ptr_k_base);
575 const auto ptr_k_r1 = reinterpret_cast<const T1 *>(ptr_k_base + kernel_stride_y);
576 const auto ptr_k_r2 = reinterpret_cast<const T1 *>(ptr_k_base + kernel_stride_y * 2);
577 const auto vk_r0 = load_matrix_row(ptr_k_r0);
578 const auto vk_r1 = load_matrix_row(ptr_k_r1);
579 const auto vk_r2 = load_matrix_row(ptr_k_r2);
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100580 for(ih = 0, oh = 0; oh < output_h; ++oh, ih += conv_stride_y)
581 {
Pablo Tello06da39d2017-08-10 15:10:40 +0100582 auto in_top = reinterpret_cast<const T1 *>(input_base + (ih + 0) * input_stride_y);
583 auto in_mid = reinterpret_cast<const T1 *>(input_base + (ih + 1) * input_stride_y);
584 auto in_low = reinterpret_cast<const T1 *>(input_base + (ih + 2) * input_stride_y);
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100585 auto p_out = reinterpret_cast<T2 *>(p_out_base + oh * output_stride_y);
586 for(int ow = 0; ow < output_w; ow += num_elems_written_per_iteration,
587 in_top += delta_input, in_mid += delta_input, in_low += delta_input, p_out += num_elems_written_per_iteration)
588 {
Georgios Pinitasa26e1662020-03-04 15:31:25 +0000589 convolve_3x3<true>(in_top, in_mid, in_low, p_out, vk_r0, vk_r1, vk_r2, stridex);
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100590 }
591 }
592 }
593 }
594 },
595 in, out);
596 }
597};
598
Pablo Tello06da39d2017-08-10 15:10:40 +0100599template <typename T1, typename T2, unsigned int stridex>
600class convolver_5x5
601{
602public:
603 static void convolve(const Window &window, unsigned int num_elems_read_per_iteration, unsigned int num_elems_written_per_iteration,
604 const ITensor *input, const ITensor *weights, ITensor *output, const PadStrideInfo &conv_info)
605 {
606 ARM_COMPUTE_UNUSED(num_elems_read_per_iteration);
Vidhya Sudhan Loganathan7485d5a2018-07-04 09:34:00 +0100607 const int input_stride_x = input->info()->strides_in_bytes().x();
608 const int input_stride_y = input->info()->strides_in_bytes().y();
609 const int input_stride_z = input->info()->strides_in_bytes().z();
610 const int output_stride_y = output->info()->strides_in_bytes().y();
611 const int output_stride_z = output->info()->strides_in_bytes().z();
612 const int kernel_stride_x = weights->info()->strides_in_bytes().x();
613 const int kernel_stride_y = weights->info()->strides_in_bytes().y();
614 const int kernel_stride_z = weights->info()->strides_in_bytes().z();
615 const int kernel_stride_w = weights->info()->strides_in_bytes()[3];
616 const int output_w = output->info()->dimension(0);
617 const int output_h = output->info()->dimension(1);
618 const int num_planes_z = window.z().end() - window.z().start();
Michele Di Giorgio13ec5f02020-01-02 12:11:13 +0000619 const int delta_input = get_input_num_elems_processed(num_elems_written_per_iteration, stridex);
Vidhya Sudhan Loganathan7485d5a2018-07-04 09:34:00 +0100620 const int kernel_depth = weights->info()->dimension(Window::DimZ);
621 const unsigned int conv_stride_y = std::get<1>(conv_info.stride());
622 const unsigned int conv_pad_left = conv_info.pad_left();
623 const unsigned int conv_pad_top = conv_info.pad_top();
Pablo Tello06da39d2017-08-10 15:10:40 +0100624
625 // setup output window for the iterator
626 Window window_out = window;
627 window_out.set(Window::DimX, Window::Dimension(0, output->info()->dimension(Window::DimX), output->info()->dimension(Window::DimX)));
628 window_out.set(Window::DimY, Window::Dimension(0, output->info()->dimension(Window::DimY), output->info()->dimension(Window::DimY)));
629 window_out.set(Window::DimZ, Window::Dimension(window.z().start(), window.z().end(), num_planes_z));
630
631 // setup input window for the iterator
632 Window window_in = window;
633 // we just want execute_window_loop to iterate over the higher dimensions (>3), so we set the first 3 dimensions to 0
634 window_in.set(Window::DimX, Window::Dimension(0, 0, 0));
635 window_in.set(Window::DimY, Window::Dimension(0, 0, 0));
636 window_in.set(Window::DimZ, Window::Dimension(0, 0, 0));
637
638 Window window_k = calculate_max_window(*weights->info(), Steps(1u));
639
640 Iterator out(output, window_out);
641 Iterator in(input, window_in);
642 Iterator k(weights, window_k);
643
644 const uint8_t *k_ptr = k.ptr();
645
646 execute_window_loop(window_out, [&](const Coordinates & id)
647 {
Georgios Pinitas15997872018-02-19 13:58:22 +0000648 const uint8_t *input_ptr = in.ptr() - conv_pad_left * input_stride_x - conv_pad_top * input_stride_y;
Pablo Tello06da39d2017-08-10 15:10:40 +0100649 uint8_t *out_ptr = out.ptr();
650 int ih = 0;
651 int oh = 0;
652 for(int oz = 0; oz < num_planes_z; ++oz)
653 {
654 const int zoffset = id.z() + oz;
655 uint8_t *p_out_base = out_ptr + oz * output_stride_z;
656 // Step 1
657 {
658 const auto ptr_k_r0 = reinterpret_cast<const T1 *>(k_ptr + 0 * kernel_stride_z + zoffset * kernel_stride_w + 0 * kernel_stride_y + 0 * kernel_stride_x);
659 const auto ptr_k_r1 = reinterpret_cast<const T1 *>(k_ptr + 0 * kernel_stride_z + zoffset * kernel_stride_w + 1 * kernel_stride_y + 0 * kernel_stride_x);
660 const auto ptr_k_r2 = reinterpret_cast<const T1 *>(k_ptr + 0 * kernel_stride_z + zoffset * kernel_stride_w + 2 * kernel_stride_y + 0 * kernel_stride_x);
661 const auto ptr_k_r3 = reinterpret_cast<const T1 *>(k_ptr + 0 * kernel_stride_z + zoffset * kernel_stride_w + 3 * kernel_stride_y + 0 * kernel_stride_x);
662 const auto ptr_k_r4 = reinterpret_cast<const T1 *>(k_ptr + 0 * kernel_stride_z + zoffset * kernel_stride_w + 4 * kernel_stride_y + 0 * kernel_stride_x);
663 for(ih = 0, oh = 0; oh < output_h; ++oh, ih += conv_stride_y)
664 {
665 auto in_0 = reinterpret_cast<const T1 *>(input_ptr + 0 * input_stride_z + (ih + 0) * input_stride_y);
666 auto in_1 = reinterpret_cast<const T1 *>(input_ptr + 0 * input_stride_z + (ih + 1) * input_stride_y);
667 auto in_2 = reinterpret_cast<const T1 *>(input_ptr + 0 * input_stride_z + (ih + 2) * input_stride_y);
668 auto in_3 = reinterpret_cast<const T1 *>(input_ptr + 0 * input_stride_z + (ih + 3) * input_stride_y);
669 auto in_4 = reinterpret_cast<const T1 *>(input_ptr + 0 * input_stride_z + (ih + 4) * input_stride_y);
670 auto p_out = reinterpret_cast<T2 *>(p_out_base + oh * output_stride_y);
671 for(int ow = 0; ow < output_w; ow += num_elems_written_per_iteration,
672 in_0 += delta_input, in_1 += delta_input, in_2 += delta_input, in_3 += delta_input, in_4 += delta_input, p_out += num_elems_written_per_iteration)
673 {
Vidhya Sudhan Loganathan7485d5a2018-07-04 09:34:00 +0100674 auto vres = convolve_5x5<stridex>(in_0, in_1, in_2, in_3, in_4, ptr_k_r0, ptr_k_r1, ptr_k_r2, ptr_k_r3, ptr_k_r4);
Pablo Tello06da39d2017-08-10 15:10:40 +0100675 store_results<stridex>(p_out, vres);
676 }
677 }
678 }
679 // Step 2
680 for(int p = 1; p < kernel_depth; ++p)
681 {
682 const auto ptr_k_r0 = reinterpret_cast<const T1 *>(k_ptr + p * kernel_stride_z + zoffset * kernel_stride_w + 0 * kernel_stride_y + 0 * kernel_stride_x);
683 const auto ptr_k_r1 = reinterpret_cast<const T1 *>(k_ptr + p * kernel_stride_z + zoffset * kernel_stride_w + 1 * kernel_stride_y + 0 * kernel_stride_x);
684 const auto ptr_k_r2 = reinterpret_cast<const T1 *>(k_ptr + p * kernel_stride_z + zoffset * kernel_stride_w + 2 * kernel_stride_y + 0 * kernel_stride_x);
685 const auto ptr_k_r3 = reinterpret_cast<const T1 *>(k_ptr + p * kernel_stride_z + zoffset * kernel_stride_w + 3 * kernel_stride_y + 0 * kernel_stride_x);
686 const auto ptr_k_r4 = reinterpret_cast<const T1 *>(k_ptr + p * kernel_stride_z + zoffset * kernel_stride_w + 4 * kernel_stride_y + 0 * kernel_stride_x);
687
688 for(ih = 0, oh = 0; oh < output_h; ++oh, ih += conv_stride_y)
689 {
690 auto in_0 = reinterpret_cast<const T1 *>(input_ptr + p * input_stride_z + (ih + 0) * input_stride_y);
691 auto in_1 = reinterpret_cast<const T1 *>(input_ptr + p * input_stride_z + (ih + 1) * input_stride_y);
692 auto in_2 = reinterpret_cast<const T1 *>(input_ptr + p * input_stride_z + (ih + 2) * input_stride_y);
693 auto in_3 = reinterpret_cast<const T1 *>(input_ptr + p * input_stride_z + (ih + 3) * input_stride_y);
694 auto in_4 = reinterpret_cast<const T1 *>(input_ptr + p * input_stride_z + (ih + 4) * input_stride_y);
695 auto p_out = reinterpret_cast<T2 *>(p_out_base + oh * output_stride_y);
696 for(int ow = 0; ow < output_w; ow += num_elems_written_per_iteration,
697 in_0 += delta_input, in_1 += delta_input, in_2 += delta_input, in_3 += delta_input, in_4 += delta_input, p_out += num_elems_written_per_iteration)
698 {
Vidhya Sudhan Loganathan7485d5a2018-07-04 09:34:00 +0100699 auto vres = convolve_5x5<stridex>(in_0, in_1, in_2, in_3, in_4, ptr_k_r0, ptr_k_r1, ptr_k_r2, ptr_k_r3, ptr_k_r4);
Pablo Tello06da39d2017-08-10 15:10:40 +0100700 accumulate_results<stridex>(p_out, vres);
701 }
702 }
703 }
704 }
705 },
706 in, out);
707 }
708};
709
Gian Marco Iodice95f93612019-06-13 15:58:32 +0100710float vreduce(const float32x4_t &v)
711{
712 auto v0 = wrapper::vgethigh(v);
713 auto v1 = wrapper::vgetlow(v);
714 auto v_out = wrapper::vadd(v0, v1);
715
716 float a = wrapper::vgetlane(v_out, 0);
717 float b = wrapper::vgetlane(v_out, 1);
718 return a + b;
719}
720
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100721template <typename T1, typename T2>
722inline void convolve_1x1(const Window &window, unsigned int num_elems_read_per_iteration, unsigned int num_elems_written_per_iteration,
723 const ITensor *input, const ITensor *weights, ITensor *output, const PadStrideInfo &conv_info)
724{
725 const unsigned int conv_stride_x = std::get<0>(conv_info.stride());
726 switch(conv_stride_x)
727 {
728 case 1:
729 convolver_1x1<T1, T2, 1>::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, input, weights, output, conv_info);
730 break;
731 case 2:
732 convolver_1x1<T1, T2, 2>::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, input, weights, output, conv_info);
733 break;
734 case 3:
735 convolver_1x1<T1, T2, 3>::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, input, weights, output, conv_info);
736 break;
737 default:
738 ARM_COMPUTE_ERROR("Not implemented");
739 }
740}
741
Pablo Telloc09314a2017-09-21 13:59:14 +0100742template <>
743inline void convolve_1x1<float, float>(const Window &window, unsigned int num_elems_read_per_iteration, unsigned int num_elems_written_per_iteration,
744 const ITensor *input, const ITensor *weights, ITensor *output, const PadStrideInfo &conv_info)
745{
746 const unsigned int conv_stride_x = std::get<0>(conv_info.stride());
747 if(run_optim_small_tensor(input))
748 {
749 switch(conv_stride_x)
750 {
751 case 1:
752 convolver_w1x1_i8x8_f32<1>::convolve(window, input, weights, output, conv_info);
753 break;
754 case 2:
755 convolver_w1x1_i8x8_f32<2>::convolve(window, input, weights, output, conv_info);
756 break;
757 case 3:
758 convolver_w1x1_i8x8_f32<3>::convolve(window, input, weights, output, conv_info);
759 break;
760 default:
761 ARM_COMPUTE_ERROR("Not implemented");
762 }
763 }
764 else
765 {
766 switch(conv_stride_x)
767 {
768 case 1:
769 convolver_1x1<float, float, 1>::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, input, weights, output, conv_info);
770 break;
771 case 2:
772 convolver_1x1<float, float, 2>::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, input, weights, output, conv_info);
773 break;
774 case 3:
775 convolver_1x1<float, float, 3>::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, input, weights, output, conv_info);
776 break;
777 default:
778 ARM_COMPUTE_ERROR("Not implemented");
779 }
780 }
781}
782
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100783template <typename T1, typename T2>
784inline void convolve_3x3(const Window &window, unsigned int num_elems_read_per_iteration, unsigned int num_elems_written_per_iteration,
785 const ITensor *input, const ITensor *weights, ITensor *output, const PadStrideInfo &conv_info)
786{
787 const unsigned int conv_stride_x = std::get<0>(conv_info.stride());
788 switch(conv_stride_x)
789 {
790 case 1:
791 convolver_3x3<T1, T2, 1>::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, input, weights, output, conv_info);
792 break;
793 case 2:
794 convolver_3x3<T1, T2, 2>::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, input, weights, output, conv_info);
795 break;
796 case 3:
797 convolver_3x3<T1, T2, 3>::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, input, weights, output, conv_info);
798 break;
799 default:
800 ARM_COMPUTE_ERROR("Not implemented");
801 }
802}
Pablo Tello06da39d2017-08-10 15:10:40 +0100803
804template <typename T1, typename T2>
805inline void convolve_5x5(const Window &window, unsigned int num_elems_read_per_iteration, unsigned int num_elems_written_per_iteration,
806 const ITensor *input, const ITensor *weights, ITensor *output, const PadStrideInfo &conv_info)
807{
808 const unsigned int conv_stride_x = std::get<0>(conv_info.stride());
809 switch(conv_stride_x)
810 {
811 case 1:
812 convolver_5x5<T1, T2, 1>::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, input, weights, output, conv_info);
813 break;
814 case 2:
815 convolver_5x5<T1, T2, 2>::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, input, weights, output, conv_info);
816 break;
817 case 3:
818 convolver_5x5<T1, T2, 3>::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, input, weights, output, conv_info);
819 break;
820 default:
821 ARM_COMPUTE_ERROR("Not implemented");
822 }
823}
824
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000825Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *output, const PadStrideInfo &conv_info)
826{
827 ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
Giorgio Arenac0f54432018-03-16 14:02:34 +0000828 ARM_COMPUTE_RETURN_ERROR_ON(input->data_layout() == DataLayout::UNKNOWN);
Anthony Barbiereaefd002018-07-20 17:49:35 +0100829 ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input);
Vidhya Sudhan Loganathan7485d5a2018-07-04 09:34:00 +0100830 ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000831 ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000832
Giorgio Arenac0f54432018-03-16 14:02:34 +0000833 const DataLayout data_layout = input->data_layout();
834 const int width_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
835 const int height_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
836 const int channel_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
837
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000838 ARM_COMPUTE_RETURN_ERROR_ON_MSG(std::get<0>(conv_info.stride()) > 3, "Strides larger than 3 not supported.");
Giorgio Arenac0f54432018-03-16 14:02:34 +0000839 ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(channel_idx) != input->dimension(channel_idx));
840 ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(width_idx) != weights->dimension(height_idx));
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000841 ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 4);
Giorgio Arenac0f54432018-03-16 14:02:34 +0000842 ARM_COMPUTE_RETURN_ERROR_ON(data_layout == DataLayout::NHWC && input->data_type() != DataType::F32);
Gian Marco Iodice41acb762018-08-23 10:25:06 +0100843 ARM_COMPUTE_RETURN_ERROR_ON((weights->dimension(width_idx) > 3) && (input->data_type() == DataType::F16));
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000844
845 // Checks performed when output is configured
846 if(output->total_size() != 0)
847 {
Giorgio Arenac0f54432018-03-16 14:02:34 +0000848 TensorShape output_shape = misc::shape_calculator::compute_deep_convolution_shape(*input, *weights, conv_info);
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000849
850 DataType data_type = input->data_type();
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000851
852 ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), output_shape);
853 ARM_COMPUTE_RETURN_ERROR_ON(output->data_type() != data_type);
854 }
855
856 return Status{};
857}
858
859std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *weights, ITensorInfo *output, const PadStrideInfo &conv_info, unsigned int &num_weight_elems_read_per_row,
Georgios Pinitas0223a782017-12-12 11:44:44 +0000860 unsigned int &num_elems_read_per_iteration, unsigned int &num_elems_written_per_iteration, BorderSize &border_size)
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000861{
Giorgio Arenac0f54432018-03-16 14:02:34 +0000862 ARM_COMPUTE_ERROR_ON(input->data_layout() == DataLayout::UNKNOWN);
863
864 const DataLayout data_layout = input->data_layout();
865 const int width_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
866
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000867 // Calculate right and bottom border
Giorgio Arenac0f54432018-03-16 14:02:34 +0000868 unsigned int kernel_size = weights->dimension(width_idx);
Georgios Pinitas1d6d2112018-02-05 17:40:12 +0000869 const int conv_stride_x = std::get<0>(conv_info.stride());
Georgios Pinitas1a03d762018-02-21 14:47:09 +0000870 const int conv_stride_y = std::get<1>(conv_info.stride());
Giorgio Arenac0f54432018-03-16 14:02:34 +0000871 const int input_width = input->dimension(width_idx);
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000872
Giorgio Arenac0f54432018-03-16 14:02:34 +0000873 Window win{};
874 bool window_changed = false;
875
876 if(data_layout == DataLayout::NCHW)
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000877 {
Giorgio Arenac0f54432018-03-16 14:02:34 +0000878 switch(kernel_size)
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000879 {
Giorgio Arenac0f54432018-03-16 14:02:34 +0000880 case 1:
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000881 {
Giorgio Arenac0f54432018-03-16 14:02:34 +0000882 switch(input->data_type())
883 {
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000884#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
Giorgio Arenac0f54432018-03-16 14:02:34 +0000885 case DataType::F16:
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000886 num_elems_written_per_iteration = 8;
Giorgio Arenac0f54432018-03-16 14:02:34 +0000887 break;
Vidhya Sudhan Loganathan7485d5a2018-07-04 09:34:00 +0100888#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
Giorgio Arenac0f54432018-03-16 14:02:34 +0000889 case DataType::F32:
890 if(run_optim_small_tensor_info(input))
891 {
892 num_elems_written_per_iteration = 8;
893 }
894 else
895 {
896 num_elems_written_per_iteration = 4;
897 }
898 break;
899 default:
900 ARM_COMPUTE_ERROR("Data type not supported.");
901 break;
902 }
903 num_weight_elems_read_per_row = kernel_size;
904 num_elems_read_per_iteration = conv_stride_x * num_elems_written_per_iteration;
905 break;
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000906 }
Giorgio Arenac0f54432018-03-16 14:02:34 +0000907 case 3:
Giorgio Arenac0f54432018-03-16 14:02:34 +0000908 switch(input->data_type())
909 {
910 case DataType::F32:
911 num_weight_elems_read_per_row = 4 + kernel_size - 1;
912 num_elems_read_per_iteration = 12;
913 num_elems_written_per_iteration = 16 >> conv_stride_x;
914 break;
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000915#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
Giorgio Arenac0f54432018-03-16 14:02:34 +0000916 case DataType::F16:
Giorgio Arenac0f54432018-03-16 14:02:34 +0000917 num_weight_elems_read_per_row = 8 + kernel_size - 1;
918 num_elems_read_per_iteration = 24;
919 num_elems_written_per_iteration = 32 >> conv_stride_x;
920 break;
Vidhya Sudhan Loganathan7485d5a2018-07-04 09:34:00 +0100921#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
Giorgio Arenac0f54432018-03-16 14:02:34 +0000922 default:
923 ARM_COMPUTE_ERROR("Data type not supported.");
924 break;
925 }
Gian Marco Iodice41acb762018-08-23 10:25:06 +0100926 break;
927 case 5:
928 {
929 switch(input->data_type())
930 {
931 case DataType::F32:
932 num_weight_elems_read_per_row = 4 + kernel_size - 1;
933 num_elems_read_per_iteration = 12;
934 num_elems_written_per_iteration = 16 >> conv_stride_x;
935 break;
936 default:
937 ARM_COMPUTE_ERROR("Data type not supported.");
938 break;
939 }
Giorgio Arenac0f54432018-03-16 14:02:34 +0000940 }
941 break;
942 default:
943 {
944 ARM_COMPUTE_ERROR("Not implemented");
945 break;
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000946 }
947 }
Giorgio Arenac0f54432018-03-16 14:02:34 +0000948
949 // Calculate right pad
950 int start_x = kernel_size / 2 - static_cast<int>(conv_info.pad_left());
951 int end_x = ceil_to_multiple(static_cast<int>(output->dimension(0)), num_elems_written_per_iteration) * conv_stride_x;
952 int upper_bound_w = ceil_to_multiple(start_x + end_x, num_elems_read_per_iteration) - input_width;
953
954 // Calculate border
955 const unsigned int conv_pad_left = conv_info.pad_left();
956 const unsigned int conv_pad_top = conv_info.pad_top();
957 const unsigned int conv_pad_right = std::max(upper_bound_w, 0);
958 const unsigned int conv_pad_bottom = conv_info.pad_bottom();
959
960 border_size.left = conv_pad_left;
961 border_size.top = conv_pad_top;
962 border_size.right = conv_pad_right;
963 border_size.bottom = conv_pad_bottom;
964
965 // Configure window
966 win = calculate_max_window(*output, Steps(num_elems_written_per_iteration));
967
968 AccessWindowRectangle input_access(input, -conv_pad_left, -conv_pad_top,
969 num_elems_read_per_iteration, kernel_size,
970 conv_stride_x, conv_stride_y);
971 AccessWindowStatic weights_access(weights, 0, 0, num_weight_elems_read_per_row, kernel_size);
972 AccessWindowHorizontal output_access(output, 0, num_elems_written_per_iteration);
973 window_changed = update_window_and_padding(win, input_access, weights_access, output_access);
974 output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000975 }
Giorgio Arenac0f54432018-03-16 14:02:34 +0000976 else
977 {
Manuel Bottini87350f42020-09-15 13:03:34 +0100978 // Configure window NHWC without any padding
979 win = calculate_max_window(*output, Steps());
Giorgio Arenac0f54432018-03-16 14:02:34 +0000980 }
Michalis Spyrouafa5d812017-11-30 14:25:57 +0000981
982 Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
983 return std::make_pair(err, win);
984}
Manuel Bottini87350f42020-09-15 13:03:34 +0100985
986bool have_zero_x_internal_padding(ITensorInfo *input, ITensorInfo *weights)
987{
988 return (input->padding().left == 0 && weights->padding().left == 0 && input->padding().right == 0 && weights->padding().right == 0);
989}
990
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100991} // namespace
992
Manuel Bottini87350f42020-09-15 13:03:34 +0100993template <typename T>
994void NEDirectConvolutionLayerKernel::convolve_nhwc_optimized(const Window &window)
995{
996 // This function assumes that input and weights have not padding in channel
997
998 // Declare useful types
999 using vtype = wrapper::traits::neon_bitvector<T, wrapper::traits::BitWidth::W128>;
1000 using vector_type = typename vtype::type;
1001 using tag_type = typename vtype::tag_type;
1002
1003 // Scalar quantities
1004 const int element_size = _input->info()->element_size();
1005 const int input_stride_w = _input->info()->strides_in_bytes().y() / element_size;
1006 const int input_stride_h = _input->info()->strides_in_bytes().z() / element_size;
1007 const int input_stride_n = _input->info()->strides_in_bytes()[3] / element_size;
1008 const int input_dim_w = _input->info()->dimension(1);
1009 const int input_dim_h = _input->info()->dimension(2);
1010
1011 const int output_stride_c = _output->info()->strides_in_bytes().x();
1012
1013 const unsigned int kernel_stride_w = _weights->info()->strides_in_bytes().y() / element_size;
1014 const unsigned int kernel_stride_h = _weights->info()->strides_in_bytes().z() / element_size;
1015 const int kernel_dim_w = _weights->info()->dimension(1);
1016 const int kernel_dim_h = _weights->info()->dimension(2);
1017
1018 const int conv_pad_top = _conv_info.pad_top();
1019 const int conv_pad_left = _conv_info.pad_left();
1020 const int conv_stride_w = std::get<0>(_conv_info.stride());
1021 const int conv_stride_h = std::get<1>(_conv_info.stride());
1022
1023 // Setup input window for the output iterator
1024 Window window_out = window;
1025 window_out.set(Window::DimX, Window::Dimension(0, 1, 1));
1026
1027 // Setup input window for the weights iterator
1028 Window window_w = calculate_max_window(*_weights->info(), Steps());
1029 window_w.set(Window::DimX, Window::Dimension(0, 1, 1));
1030 window_w.set(Window::DimY, Window::Dimension(0, 1, 1));
1031 window_w.set(Window::DimZ, Window::Dimension(0, 1, 1));
1032
1033 Iterator out(_output, window_out);
1034 Iterator wei(_weights, window_w);
1035
1036 constexpr int num_elems_read_per_iteration = 16 / sizeof(T);
1037 /*
1038 * This implementation parallelize the full WC plane of input and weights by
1039 * treating them as series of elements. So for example, a 3x3 weights and
1040 * floating point vector operations of 4 elements per time, the first 3
1041 * channel elements of the first row would be taken and additionally the first
1042 * element of the second row. The 9 elements in each single WC weight plane
1043 * would require 2 4-element vector operations and a last single element operation.
1044 *
1045 * This works since when we create the input vector to multiply with the weights,
1046 * the exact required elements are loaded in the same order. Therefore the
1047 * multiplication works on the correct input/weight elements.
1048 */
1049 execute_window_loop(window_out, [&](const Coordinates & id)
1050 {
1051 /*
1052 * In here we create theoretical indexes which then we validate for both
1053 * inputs and weights.
1054 * As a reminder, this loop take each output point in NHW, C is treated
1055 * in the weights loop.
1056 */
1057 // We are computing the theoretical starting input starting points
1058 const int in_w_start_t = static_cast<int>(id.y()) * conv_stride_w - conv_pad_left;
1059 const int in_h_start_t = static_cast<int>(id.z()) * conv_stride_h - conv_pad_top;
1060 const int in_w_end_t = in_w_start_t + kernel_dim_w;
1061 const int in_h_end_t = in_h_start_t + kernel_dim_h;
1062
1063 // We are computing the valid initial and ending input points by checking the borders
1064 const int in_w_start = std::max(in_w_start_t, 0);
1065 const int in_h_start = std::max(in_h_start_t, 0);
1066 const int in_w_end = std::min(in_w_end_t, input_dim_w);
1067 const int in_h_end = std::min(in_h_end_t, input_dim_h);
1068
1069 // We use the input points to select the valid weight points to use
1070 const int index_wc_start = (in_w_start - in_w_start_t) * kernel_stride_w;
1071 const int index_h_start = in_h_start - in_h_start_t;
1072 const int index_wc_end = (kernel_dim_w - (in_w_end_t - in_w_end)) * kernel_stride_w;
1073 const int index_h_end = kernel_dim_h - (in_h_end_t - in_h_end);
1074
1075 execute_window_loop(window_w, [&](const Coordinates & id_w)
1076 {
1077 /*
1078 * This is the loop in the weights, and it goes along N (the batches)
1079 * As a reminder, the batches of the weights are translated into the
1080 * channels of the output
1081 */
1082 const T *in_ptr_row = reinterpret_cast<const T *>(_input->buffer() + _input->info()->offset_first_element_in_bytes())
1083 + id[3] * input_stride_n + in_w_start * input_stride_w + in_h_start * input_stride_h;
1084 const T *weights_ptr_row = reinterpret_cast<const T *>(wei.ptr()) + index_h_start * kernel_stride_h;
1085 uint8_t *out_ptr = out.ptr() + id_w[3] * output_stride_c;
1086
1087 T out_temp = static_cast<T>(0);
1088 for(int index_h = index_h_start; index_h < index_h_end; ++index_h, in_ptr_row += input_stride_h, weights_ptr_row += kernel_stride_h)
1089 {
1090 const T *in_ptr_mover = in_ptr_row;
1091 int index_wc = index_wc_start;
1092 vector_type out_temp_vec = wrapper::vdup_n(static_cast<T>(0), tag_type());
1093 for(; index_wc <= index_wc_end - num_elems_read_per_iteration; index_wc += num_elems_read_per_iteration, in_ptr_mover += num_elems_read_per_iteration)
1094 {
1095 const auto src_vec = wrapper::vloadq(in_ptr_mover);
1096 const auto w_vec = wrapper::vloadq(weights_ptr_row + index_wc);
1097 out_temp_vec = wrapper::vmla(out_temp_vec, w_vec, src_vec);
1098 }
1099 out_temp += vreduce(out_temp_vec);
1100 for(; index_wc < index_wc_end; ++index_wc, ++in_ptr_mover)
1101 {
1102 const auto src_val = *(in_ptr_mover);
1103 const auto w_val = *(weights_ptr_row + index_wc);
1104 out_temp += src_val * w_val;
1105 }
1106 }
1107 *(reinterpret_cast<T *>(out_ptr)) = out_temp;
1108 },
1109 wei);
1110 },
1111 out);
1112}
1113
1114template <typename T>
1115void NEDirectConvolutionLayerKernel::convolve_nhwc(const Window &window)
1116{
1117 // Declare useful types
1118 using vtype = wrapper::traits::neon_bitvector<T, wrapper::traits::BitWidth::W128>;
1119 using vector_type = typename vtype::type;
1120 using tag_type = typename vtype::tag_type;
1121
1122 // Scalar quantities
1123 const int element_size = _input->info()->element_size();
1124 const int input_stride_w = _input->info()->strides_in_bytes().y() / element_size;
1125 const int input_stride_h = _input->info()->strides_in_bytes().z() / element_size;
1126 const int input_stride_n = _input->info()->strides_in_bytes()[3] / element_size;
1127 const int input_dim_w = _input->info()->dimension(1);
1128 const int input_dim_h = _input->info()->dimension(2);
1129
1130 const int output_stride_c = _output->info()->strides_in_bytes().x();
1131
1132 const unsigned int kernel_stride_w = _weights->info()->strides_in_bytes().y() / element_size;
1133 const unsigned int kernel_stride_h = _weights->info()->strides_in_bytes().z() / element_size;
1134 const int kernel_dim_w = _weights->info()->dimension(1);
1135 const int kernel_dim_h = _weights->info()->dimension(2);
1136
1137 const int conv_pad_top = _conv_info.pad_top();
1138 const int conv_pad_left = _conv_info.pad_left();
1139 const int conv_stride_w = std::get<0>(_conv_info.stride());
1140 const int conv_stride_h = std::get<1>(_conv_info.stride());
1141
1142 // Setup input window for the output iterator
1143 Window window_out = window;
1144 window_out.set(Window::DimX, Window::Dimension(0, 1, 1));
1145
1146 // Setup input window for the weights iterator
1147 Window window_w = calculate_max_window(*_weights->info(), Steps());
1148 window_w.set(Window::DimX, Window::Dimension(0, 1, 1));
1149 window_w.set(Window::DimY, Window::Dimension(0, 1, 1));
1150 window_w.set(Window::DimZ, Window::Dimension(0, 1, 1));
1151
1152 Iterator out(_output, window_out);
1153 Iterator wei(_weights, window_w);
1154
1155 constexpr int num_elems_read_per_iteration = 16 / sizeof(T);
1156
1157 execute_window_loop(window_out, [&](const Coordinates & id)
1158 {
1159 // We are computing the theoretical starting input starting points
1160 const int in_w_start_t = static_cast<int>(id.y()) * conv_stride_w - conv_pad_left;
1161 const int in_h_start_t = static_cast<int>(id.z()) * conv_stride_h - conv_pad_top;
1162 const int in_w_end_t = in_w_start_t + kernel_dim_w;
1163 const int in_h_end_t = in_h_start_t + kernel_dim_h;
1164
1165 // We are computing the valid initial and ending input points by checking the borders
1166 const int in_w_start = std::max(in_w_start_t, 0);
1167 const int in_h_start = std::max(in_h_start_t, 0);
1168 const int in_w_end = std::min(in_w_end_t, input_dim_w);
1169 const int in_h_end = std::min(in_h_end_t, input_dim_h);
1170
1171 // We use the input points to select the valid weight points to use
1172 const int wei_w_start = in_w_start - in_w_start_t;
1173 const int wei_h_start = in_h_start - in_h_start_t;
1174 const int wei_w_end = kernel_dim_w - (in_w_end_t - in_w_end);
1175 const int wei_h_end = kernel_dim_h - (in_h_end_t - in_h_end);
1176
1177 const int index_c_end = _weights->info()->dimension(0);
1178 const T *const in_ptr_start = reinterpret_cast<const T *>(_input->buffer() + _input->info()->offset_first_element_in_bytes()) + id[3] * input_stride_n;
1179
1180 execute_window_loop(window_w, [&](const Coordinates & id_w)
1181 {
1182 const T *const weights_ptr_start = reinterpret_cast<const T *>(wei.ptr());
1183 uint8_t *out_ptr = out.ptr() + id_w[3] * output_stride_c;
1184
1185 T out_temp = static_cast<T>(0);
1186 for(int index_wei_h = wei_h_start, index_in_h = in_h_start; index_wei_h < wei_h_end; ++index_wei_h, ++index_in_h)
1187 {
1188 const T *const in_ptr_row = in_ptr_start + index_in_h * input_stride_h;
1189 const T *const weights_ptr_row = weights_ptr_start + index_wei_h * kernel_stride_h;
1190 for(int index_wei_w = wei_w_start, index_in_w = in_w_start; index_wei_w < wei_w_end; ++index_wei_w, ++index_in_w)
1191 {
1192 const T *in_ptr_mover = in_ptr_row + index_in_w * input_stride_w;
1193 const T *weights_ptr_mover = weights_ptr_row + index_wei_w * kernel_stride_w;
1194 int index_c = 0;
1195 vector_type out_temp_vec = wrapper::vdup_n(static_cast<T>(0), tag_type());
1196 for(; index_c <= index_c_end - num_elems_read_per_iteration; index_c += num_elems_read_per_iteration, in_ptr_mover += num_elems_read_per_iteration, weights_ptr_mover += num_elems_read_per_iteration)
1197 {
1198 const auto src_vec = wrapper::vloadq(in_ptr_mover);
1199 const auto w_vec = wrapper::vloadq(weights_ptr_mover);
1200 out_temp_vec = wrapper::vmla(out_temp_vec, w_vec, src_vec);
1201 }
1202 out_temp += vreduce(out_temp_vec);
1203 for(; index_c < index_c_end; ++index_c, ++in_ptr_mover, ++weights_ptr_mover)
1204 {
1205 const auto src_val = *(in_ptr_mover);
1206 const auto w_val = *(weights_ptr_mover);
1207 out_temp += src_val * w_val;
1208 }
1209 }
1210 }
1211 *(reinterpret_cast<T *>(out_ptr)) = out_temp;
1212 },
1213 wei);
1214 },
1215 out);
1216}
1217
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001218NEDirectConvolutionLayerKernel::NEDirectConvolutionLayerKernel()
Georgios Pinitas898a8062017-09-12 19:19:12 +01001219 : _input(nullptr), _weights(nullptr), _output(nullptr), _conv_info(), _border_size(0), _kernel_size(0), _num_weight_elems_read_per_row(0), _num_elems_read_per_iteration(0),
Manuel Bottinica62c6f2021-03-23 11:50:34 +00001220 _num_elems_written_per_iteration(0), _data_layout()
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001221{
1222}
1223
1224BorderSize NEDirectConvolutionLayerKernel::border_size() const
1225{
1226 return _border_size;
1227}
1228
1229void NEDirectConvolutionLayerKernel::configure(const ITensor *input, const ITensor *weights, ITensor *output, const PadStrideInfo &conv_info)
1230{
Michalis Spyrouafa5d812017-11-30 14:25:57 +00001231 ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001232
1233 _input = input;
1234 _weights = weights;
1235 _output = output;
1236 _conv_info = conv_info;
Manuel Bottinica62c6f2021-03-23 11:50:34 +00001237 _data_layout = _input->info()->data_layout();
1238 _kernel_size = weights->info()->dimension(get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH));
Michalis Spyrou621965e2018-01-08 17:11:26 +00001239
1240 const unsigned int conv_pad_left = conv_info.pad_left();
1241 const unsigned int conv_pad_top = conv_info.pad_top();
1242 const unsigned int conv_pad_right = conv_info.pad_right();
1243 const unsigned int conv_pad_bottom = conv_info.pad_bottom();
Manuel Bottinica62c6f2021-03-23 11:50:34 +00001244 if(_data_layout == DataLayout::NCHW)
Manuel Bottini87350f42020-09-15 13:03:34 +01001245 {
1246 _border_size = BorderSize(conv_pad_top, conv_pad_right, conv_pad_bottom, conv_pad_left);
1247 }
1248 else
1249 {
1250 _border_size = BorderSize(0);
1251 }
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001252
Gian Marco Iodice5cb4d6a2017-08-08 10:53:00 +01001253 // Get convolved dimensions
Giorgio Arenac0f54432018-03-16 14:02:34 +00001254 TensorShape output_shape = misc::shape_calculator::compute_deep_convolution_shape(*input->info(), *weights->info(), conv_info);
Gian Marco Iodice5cb4d6a2017-08-08 10:53:00 +01001255
1256 DataType data_type = input->info()->data_type();
1257
Gian Marco Iodice5cb4d6a2017-08-08 10:53:00 +01001258 // Output auto inizialitation if not yet initialized
Vidhya Sudhan Loganathan7485d5a2018-07-04 09:34:00 +01001259 auto_init_if_empty(*output->info(), output_shape, 1, data_type);
Gian Marco Iodice5cb4d6a2017-08-08 10:53:00 +01001260
Michalis Spyrouafa5d812017-11-30 14:25:57 +00001261 // Perform validation step
1262 ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), weights->info(), output->info(), conv_info));
Gian Marco Iodice5cb4d6a2017-08-08 10:53:00 +01001263
Michalis Spyrouafa5d812017-11-30 14:25:57 +00001264 // Configure kernel window
1265 auto win_config = validate_and_configure_window(input->info(), weights->info(), output->info(), conv_info, _num_weight_elems_read_per_row,
Georgios Pinitas0223a782017-12-12 11:44:44 +00001266 _num_elems_read_per_iteration, _num_elems_written_per_iteration, _border_size);
Michalis Spyrouafa5d812017-11-30 14:25:57 +00001267 ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
1268 INEKernel::configure(win_config.second);
1269}
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001270
Michalis Spyrouafa5d812017-11-30 14:25:57 +00001271Status NEDirectConvolutionLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *output, const PadStrideInfo &conv_info)
1272{
1273 unsigned int num_weight_elems_read_per_row = 0;
1274 unsigned int num_elems_read_per_iteration = 0;
1275 unsigned int num_elems_written_per_iteration = 0;
Georgios Pinitas15997872018-02-19 13:58:22 +00001276 BorderSize border_size = {};
Michalis Spyrouafa5d812017-11-30 14:25:57 +00001277 ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, weights, output, conv_info));
Georgios Pinitas0223a782017-12-12 11:44:44 +00001278 ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(),
1279 weights->clone().get(),
1280 output->clone().get(),
1281 conv_info,
1282 num_weight_elems_read_per_row,
1283 num_elems_read_per_iteration,
1284 num_elems_written_per_iteration,
1285 border_size)
Michalis Spyrouafa5d812017-11-30 14:25:57 +00001286 .first);
Georgios Pinitas898a8062017-09-12 19:19:12 +01001287
Michalis Spyrouafa5d812017-11-30 14:25:57 +00001288 return Status{};
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001289}
1290
Moritz Pflanzerc186b572017-09-07 09:48:04 +01001291void NEDirectConvolutionLayerKernel::run(const Window &window, const ThreadInfo &info)
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001292{
Moritz Pflanzerc186b572017-09-07 09:48:04 +01001293 ARM_COMPUTE_UNUSED(info);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001294 ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
1295 ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
1296 ARM_COMPUTE_ERROR_ON(_input->buffer() == nullptr);
1297
Manuel Bottinica62c6f2021-03-23 11:50:34 +00001298 const int kernel_size = _weights->info()->dimension(get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH));
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001299
Manuel Bottinica62c6f2021-03-23 11:50:34 +00001300 if(_data_layout == DataLayout::NCHW)
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001301 {
Giorgio Arenac0f54432018-03-16 14:02:34 +00001302 switch(kernel_size)
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001303 {
Giorgio Arenac0f54432018-03-16 14:02:34 +00001304 case 1:
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001305 {
Giorgio Arenac0f54432018-03-16 14:02:34 +00001306 switch(_input->info()->data_type())
1307 {
Giorgio Arenac0f54432018-03-16 14:02:34 +00001308 case DataType::F32:
1309 convolve_1x1<float, float>(window, _num_elems_read_per_iteration, _num_elems_written_per_iteration, _input, _weights, _output, _conv_info);
1310 break;
Ioan-Cristian Szabo5edbd1c2017-11-13 13:34:08 +00001311#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
Giorgio Arenac0f54432018-03-16 14:02:34 +00001312 case DataType::F16:
1313 convolve_1x1<float16_t, float16_t>(window, _num_elems_read_per_iteration, _num_elems_written_per_iteration, _input, _weights, _output, _conv_info);
1314 break;
Ioan-Cristian Szabo5edbd1c2017-11-13 13:34:08 +00001315#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
Giorgio Arenac0f54432018-03-16 14:02:34 +00001316 default:
1317 ARM_COMPUTE_ERROR("Data type not supported");
1318 break;
1319 }
1320 break;
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001321 }
Giorgio Arenac0f54432018-03-16 14:02:34 +00001322 case 3:
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001323 {
Giorgio Arenac0f54432018-03-16 14:02:34 +00001324 switch(_input->info()->data_type())
1325 {
Giorgio Arenac0f54432018-03-16 14:02:34 +00001326 case DataType::F32:
1327 convolve_3x3<float, float>(window, _num_elems_read_per_iteration, _num_elems_written_per_iteration, _input, _weights, _output, _conv_info);
1328 break;
Ioan-Cristian Szabo5edbd1c2017-11-13 13:34:08 +00001329#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
Giorgio Arenac0f54432018-03-16 14:02:34 +00001330 case DataType::F16:
1331 convolve_3x3<float16_t, float16_t>(window, _num_elems_read_per_iteration, _num_elems_written_per_iteration, _input, _weights, _output, _conv_info);
1332 break;
Ioan-Cristian Szabo5edbd1c2017-11-13 13:34:08 +00001333#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
Giorgio Arenac0f54432018-03-16 14:02:34 +00001334 default:
1335 ARM_COMPUTE_ERROR("Data type not supported");
1336 break;
1337 }
1338 break;
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001339 }
Giorgio Arenac0f54432018-03-16 14:02:34 +00001340 case 5:
Pablo Tello06da39d2017-08-10 15:10:40 +01001341 {
Giorgio Arenac0f54432018-03-16 14:02:34 +00001342 switch(_input->info()->data_type())
1343 {
1344 case DataType::F32:
1345 convolve_5x5<float, float>(window, _num_elems_read_per_iteration, _num_elems_written_per_iteration, _input, _weights, _output, _conv_info);
1346 break;
1347 default:
1348 ARM_COMPUTE_ERROR("Data type not supported");
1349 break;
1350 }
1351 break;
Pablo Tello06da39d2017-08-10 15:10:40 +01001352 }
Giorgio Arenac0f54432018-03-16 14:02:34 +00001353 default:
1354 {
1355 ARM_COMPUTE_ERROR("Only kernel sizes 1x1, 3x3 and 5x5 are supported.");
1356 break;
1357 }
1358 }
1359 }
1360 else
1361 {
1362 switch(_input->info()->data_type())
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001363 {
Giorgio Arenac0f54432018-03-16 14:02:34 +00001364 case DataType::F32:
Gian Marco Iodice95f93612019-06-13 15:58:32 +01001365 {
Manuel Bottini87350f42020-09-15 13:03:34 +01001366 if(have_zero_x_internal_padding(_input->info(), _weights->info()))
Gian Marco Iodice95f93612019-06-13 15:58:32 +01001367 {
Manuel Bottini87350f42020-09-15 13:03:34 +01001368 convolve_nhwc_optimized<float>(window);
Gian Marco Iodice95f93612019-06-13 15:58:32 +01001369 }
1370 else
1371 {
Manuel Bottini87350f42020-09-15 13:03:34 +01001372 convolve_nhwc<float>(window);
Gian Marco Iodice95f93612019-06-13 15:58:32 +01001373 }
Giorgio Arenac0f54432018-03-16 14:02:34 +00001374 break;
Gian Marco Iodice95f93612019-06-13 15:58:32 +01001375 }
Giorgio Arenac0f54432018-03-16 14:02:34 +00001376 default:
1377 ARM_COMPUTE_ERROR("Data type not supported");
1378 break;
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001379 }
1380 }
1381}
Sheri Zhangac6499a2021-02-10 15:32:38 +00001382} // namespace arm_compute