blob: b2cebc4230679133d75b6392414fa69e48c3c114 [file] [log] [blame]
/*
 * Copyright (c) 2021 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
Georgios Pinitas7891a732021-08-20 21:39:25 +010024#include "src/cpu/kernels/CpuTransposeKernel.h"
Anthony Barbier6ff3b192017-09-04 18:44:23 +010025
Anthony Barbier6ff3b192017-09-04 18:44:23 +010026#include "arm_compute/core/Error.h"
27#include "arm_compute/core/Helpers.h"
28#include "arm_compute/core/ITensor.h"
Isabella Gottardid56e7702018-02-28 14:29:36 +000029#include "arm_compute/core/TensorInfo.h"
Teresa Charlind1dc09c2021-03-04 15:24:45 +000030#include "arm_compute/core/Types.h"
Anthony Barbier6ff3b192017-09-04 18:44:23 +010031#include "arm_compute/core/Validate.h"
Teresa Charlind1dc09c2021-03-04 15:24:45 +000032#include "arm_compute/core/utils/misc/ShapeCalculator.h"
Sang-Hoon Park68dd25f2020-10-19 16:00:11 +010033#include "src/core/helpers/AutoConfiguration.h"
34#include "src/core/helpers/WindowHelpers.h"
Anthony Barbier6ff3b192017-09-04 18:44:23 +010035
36#include <arm_neon.h>
37
Anthony Barbier6ff3b192017-09-04 18:44:23 +010038namespace arm_compute
39{
Teresa Charlind1dc09c2021-03-04 15:24:45 +000040namespace cpu
41{
42namespace kernels
43{
Anthony Barbier6ff3b192017-09-04 18:44:23 +010044namespace
45{
Michalis Spyrou0b1452d2020-02-27 16:20:19 +000046unsigned int num_elems_processed(size_t element_size)
Gian Marco7c435f22017-12-05 16:17:23 +000047{
Michalis Spyrou0b1452d2020-02-27 16:20:19 +000048 switch(element_size)
Gian Marco7c435f22017-12-05 16:17:23 +000049 {
Michalis Spyrou0b1452d2020-02-27 16:20:19 +000050 case 1:
51 return 8;
52 case 2:
Michalis Spyrou0b1452d2020-02-27 16:20:19 +000053 return 4;
Ethan Doea07c01b2023-04-14 17:24:33 +000054 case 4:
55#ifdef __aarch64__
56 return 8;
57#else // __aarch64__
58 return 4;
59#endif // __aarch64__
Michalis Spyrou0b1452d2020-02-27 16:20:19 +000060 default:
61 break;
Gian Marco7c435f22017-12-05 16:17:23 +000062 }
63
Michalis Spyrou0b1452d2020-02-27 16:20:19 +000064 ARM_COMPUTE_ERROR("Element size not supported");
Gian Marco7c435f22017-12-05 16:17:23 +000065}
66
/** Transpose the 8-bit elements of @p in into @p out over the given window.
 *
 * Rows are consumed in tiles of 8:
 *  - full 8x8 tiles are transposed with Neon,
 *  - columns left over along x are written as single 8-element strips,
 *  - rows left over along y are copied one element at a time.
 *
 * @param[in]  in     Source tensor.
 * @param[out] out    Destination tensor; element (x, y) of @p in is written to (y, x) of @p out.
 * @param[in]  window Region of @p in to process. Its y range is clamped to the second dimension of @p in.
 */
void transpose_8bit_elements(const ITensor *in, ITensor *out, const Window &window)
{
    const int window_step_x  = 8;
    const int window_step_y  = 8;
    const int window_start_x = window.x().start();
    const int window_end_x   = window.x().end();
    const int window_start_y = window.y().start();
    const int window_end_y   = std::min(window.y().end(), static_cast<int>(in->info()->dimension(1)));
    // Absolute y coordinate at which the last full tile ends. The window start has to be added back:
    // without it the value is a length, and a window that does not begin at row 0 would skip the SIMD
    // path and make the left-over loop re-process rows that lie outside the window.
    const int    window_end_y_multiple_of = window_start_y + ((window_end_y - window_start_y) / window_step_y) * window_step_y;
    const size_t input_stride_in_bytes    = in->info()->strides_in_bytes()[1];
    const size_t output_stride_in_bytes   = out->info()->strides_in_bytes()[1];

    // Check if we need a left-over loop for the y dimension
    const bool left_over_loop_y = (((window_end_y - window_start_y) % window_step_y) != 0);

    // The x dimension is collapsed: the lambda below walks x manually
    Window window_in(window);
    window_in.set(Window::DimX, Window::Dimension(0, 1, 1));
    if(left_over_loop_y)
    {
        // Check if at least one full tile fits before the left-over rows
        if(window_end_y_multiple_of > window_start_y)
        {
            window_in.set(Window::DimY, Window::Dimension(window_start_y, window_end_y_multiple_of, window_step_y));
        }
        else
        {
            // Empty range: nothing for the SIMD path to do
            window_in.set(Window::DimY, Window::Dimension(0, 0, 1));
        }
    }

    // The output iterator stays put in x/y; destination addresses are computed explicitly
    Window window_out(window);
    window_out.set(Window::DimX, Window::Dimension(0, 0, 0));
    window_out.set(Window::DimY, Window::Dimension(0, 0, 0));

    Iterator output(out, window_out);

    // Run the SIMD path if and only if the input is not a row-vector
    if(in->info()->dimension(1) != 1)
    {
        Iterator input(in, window_in);
        execute_window_loop(window_in, [&](const Coordinates & id)
        {
            // Compute 8x8 elements per iteration
            int x = window_start_x;
            for(; x <= (window_end_x - window_step_x); x += window_step_x)
            {
                const uint8x8_t row0 = vld1_u8(reinterpret_cast<const uint8_t *>(input.ptr() + x + 0 * input_stride_in_bytes));
                const uint8x8_t row1 = vld1_u8(reinterpret_cast<const uint8_t *>(input.ptr() + x + 1 * input_stride_in_bytes));
                const uint8x8_t row2 = vld1_u8(reinterpret_cast<const uint8_t *>(input.ptr() + x + 2 * input_stride_in_bytes));
                const uint8x8_t row3 = vld1_u8(reinterpret_cast<const uint8_t *>(input.ptr() + x + 3 * input_stride_in_bytes));
                const uint8x8_t row4 = vld1_u8(reinterpret_cast<const uint8_t *>(input.ptr() + x + 4 * input_stride_in_bytes));
                const uint8x8_t row5 = vld1_u8(reinterpret_cast<const uint8_t *>(input.ptr() + x + 5 * input_stride_in_bytes));
                const uint8x8_t row6 = vld1_u8(reinterpret_cast<const uint8_t *>(input.ptr() + x + 6 * input_stride_in_bytes));
                const uint8x8_t row7 = vld1_u8(reinterpret_cast<const uint8_t *>(input.ptr() + x + 7 * input_stride_in_bytes));

                // Transpose 2x2
                const uint8x8x2_t k0_u8 = vtrn_u8(row0, row1);
                const uint8x8x2_t k1_u8 = vtrn_u8(row2, row3);
                const uint8x8x2_t k2_u8 = vtrn_u8(row4, row5);
                const uint8x8x2_t k3_u8 = vtrn_u8(row6, row7);

                // Transpose 4x4
                const uint16x4x2_t k0_u16 = vtrn_u16(vreinterpret_u16_u8(k0_u8.val[0]), vreinterpret_u16_u8(k1_u8.val[0]));
                const uint16x4x2_t k1_u16 = vtrn_u16(vreinterpret_u16_u8(k0_u8.val[1]), vreinterpret_u16_u8(k1_u8.val[1]));
                const uint16x4x2_t k2_u16 = vtrn_u16(vreinterpret_u16_u8(k2_u8.val[0]), vreinterpret_u16_u8(k3_u8.val[0]));
                const uint16x4x2_t k3_u16 = vtrn_u16(vreinterpret_u16_u8(k2_u8.val[1]), vreinterpret_u16_u8(k3_u8.val[1]));

                // Transpose 8x8
                const uint32x2x2_t k0_u32 = vtrn_u32(vreinterpret_u32_u16(k0_u16.val[0]), vreinterpret_u32_u16(k2_u16.val[0]));
                const uint32x2x2_t k1_u32 = vtrn_u32(vreinterpret_u32_u16(k0_u16.val[1]), vreinterpret_u32_u16(k2_u16.val[1]));
                const uint32x2x2_t k2_u32 = vtrn_u32(vreinterpret_u32_u16(k1_u16.val[0]), vreinterpret_u32_u16(k3_u16.val[0]));
                const uint32x2x2_t k3_u32 = vtrn_u32(vreinterpret_u32_u16(k1_u16.val[1]), vreinterpret_u32_u16(k3_u16.val[1]));

                // Compute destination address: input column x becomes output row x, input row y becomes output column y
                const size_t dst_offset_in_bytes = id.y() * sizeof(uint8_t) + x * output_stride_in_bytes;

                vst1_u8(reinterpret_cast<uint8_t *>(output.ptr() + dst_offset_in_bytes + 0 * output_stride_in_bytes), vreinterpret_u8_u16(vreinterpret_u16_u32(k0_u32.val[0])));
                vst1_u8(reinterpret_cast<uint8_t *>(output.ptr() + dst_offset_in_bytes + 1 * output_stride_in_bytes), vreinterpret_u8_u16(vreinterpret_u16_u32(k2_u32.val[0])));
                vst1_u8(reinterpret_cast<uint8_t *>(output.ptr() + dst_offset_in_bytes + 2 * output_stride_in_bytes), vreinterpret_u8_u16(vreinterpret_u16_u32(k1_u32.val[0])));
                vst1_u8(reinterpret_cast<uint8_t *>(output.ptr() + dst_offset_in_bytes + 3 * output_stride_in_bytes), vreinterpret_u8_u16(vreinterpret_u16_u32(k3_u32.val[0])));
                vst1_u8(reinterpret_cast<uint8_t *>(output.ptr() + dst_offset_in_bytes + 4 * output_stride_in_bytes), vreinterpret_u8_u16(vreinterpret_u16_u32(k0_u32.val[1])));
                vst1_u8(reinterpret_cast<uint8_t *>(output.ptr() + dst_offset_in_bytes + 5 * output_stride_in_bytes), vreinterpret_u8_u16(vreinterpret_u16_u32(k2_u32.val[1])));
                vst1_u8(reinterpret_cast<uint8_t *>(output.ptr() + dst_offset_in_bytes + 6 * output_stride_in_bytes), vreinterpret_u8_u16(vreinterpret_u16_u32(k1_u32.val[1])));
                vst1_u8(reinterpret_cast<uint8_t *>(output.ptr() + dst_offset_in_bytes + 7 * output_stride_in_bytes), vreinterpret_u8_u16(vreinterpret_u16_u32(k3_u32.val[1])));
            }

            // Compute left-over elements along the x dimension: one input column of 8 rows at a time
            for(; x < window_end_x; ++x)
            {
                const uint8_t val0 = *(input.ptr() + x + 0 * input_stride_in_bytes);
                const uint8_t val1 = *(input.ptr() + x + 1 * input_stride_in_bytes);
                const uint8_t val2 = *(input.ptr() + x + 2 * input_stride_in_bytes);
                const uint8_t val3 = *(input.ptr() + x + 3 * input_stride_in_bytes);
                const uint8_t val4 = *(input.ptr() + x + 4 * input_stride_in_bytes);
                const uint8_t val5 = *(input.ptr() + x + 5 * input_stride_in_bytes);
                const uint8_t val6 = *(input.ptr() + x + 6 * input_stride_in_bytes);
                const uint8_t val7 = *(input.ptr() + x + 7 * input_stride_in_bytes);

                uint8x8_t result = vdup_n_u8(0);
                result           = vset_lane_u8(val0, result, 0);
                result           = vset_lane_u8(val1, result, 1);
                result           = vset_lane_u8(val2, result, 2);
                result           = vset_lane_u8(val3, result, 3);
                result           = vset_lane_u8(val4, result, 4);
                result           = vset_lane_u8(val5, result, 5);
                result           = vset_lane_u8(val6, result, 6);
                result           = vset_lane_u8(val7, result, 7);

                // Compute destination address
                const size_t dst_offset_in_bytes = id.y() * sizeof(uint8_t) + x * output_stride_in_bytes;

                vst1_u8(output.ptr() + dst_offset_in_bytes, result);
            }
        },
        input, output);
    }

    if(left_over_loop_y)
    {
        // Visit every (x, y) of the remaining rows individually
        window_in.set(Window::DimX, Window::Dimension(window.x().start(), window.x().end(), 1));
        window_in.set(Window::DimY, Window::Dimension(window_end_y_multiple_of, window_end_y, 1));

        Iterator input(in, window_in);
        Iterator output(out, window_out);

        // Compute left-over elements along the y dimension (1x1)
        execute_window_loop(window_in, [&](const Coordinates & id)
        {
            const uint8_t val0 = *input.ptr();

            // Compute destination address
            const size_t dst_offset_in_bytes = id.y() * sizeof(uint8_t) + id.x() * output_stride_in_bytes;

            *(output.ptr() + dst_offset_in_bytes) = val0;
        },
        input, output);
    }
}
205
/** Transpose the 16-bit elements of @p in into @p out over the given window.
 *
 * Rows are consumed in tiles of 4:
 *  - full 4x4 tiles are transposed with Neon,
 *  - columns left over along x are written as single 4-element strips,
 *  - rows left over along y are copied one element at a time.
 *
 * @param[in]  in     Source tensor.
 * @param[out] out    Destination tensor; element (x, y) of @p in is written to (y, x) of @p out.
 * @param[in]  window Region of @p in to process. Its y range is clamped to the second dimension of @p in.
 */
void transpose_16bit_elements(const ITensor *in, ITensor *out, const Window &window)
{
    const int window_step_x  = 4;
    const int window_step_y  = 4;
    const int window_start_x = window.x().start();
    const int window_end_x   = window.x().end();
    const int window_start_y = window.y().start();
    const int window_end_y   = std::min(window.y().end(), static_cast<int>(in->info()->dimension(1)));
    // Absolute y coordinate at which the last full tile ends. The window start has to be added back:
    // without it the value is a length, and a window that does not begin at row 0 would skip the SIMD
    // path and make the left-over loop re-process rows that lie outside the window.
    const int    window_end_y_multiple_of = window_start_y + ((window_end_y - window_start_y) / window_step_y) * window_step_y;
    const size_t input_stride_in_bytes    = in->info()->strides_in_bytes()[1];
    const size_t output_stride_in_bytes   = out->info()->strides_in_bytes()[1];

    // Check if we need a left-over loop for the y dimension
    const bool left_over_loop_y = (((window_end_y - window_start_y) % window_step_y) != 0);

    // The x dimension is collapsed: the lambda below walks x manually
    Window window_in(window);
    window_in.set(Window::DimX, Window::Dimension(0, 1, 1));
    if(left_over_loop_y)
    {
        // Check if at least one full tile fits before the left-over rows
        if(window_end_y_multiple_of > window_start_y)
        {
            window_in.set(Window::DimY, Window::Dimension(window_start_y, window_end_y_multiple_of, window_step_y));
        }
        else
        {
            // Empty range: nothing for the SIMD path to do
            window_in.set(Window::DimY, Window::Dimension(0, 0, 1));
        }
    }

    // The output iterator stays put in x/y; destination addresses are computed explicitly
    Window window_out(window);
    window_out.set(Window::DimX, Window::Dimension(0, 0, 0));
    window_out.set(Window::DimY, Window::Dimension(0, 0, 0));

    Iterator output(out, window_out);

    // Run the SIMD path if and only if the input is not a row-vector
    if(in->info()->dimension(1) != 1)
    {
        Iterator input(in, window_in);
        execute_window_loop(window_in, [&](const Coordinates & id)
        {
            // Compute 4x4 elements per iteration
            int x = window_start_x;
            for(; x <= (window_end_x - window_step_x); x += window_step_x)
            {
                const uint16x4_t row0 = vld1_u16(reinterpret_cast<const uint16_t *>(input.ptr() + 0 * input_stride_in_bytes) + x);
                const uint16x4_t row1 = vld1_u16(reinterpret_cast<const uint16_t *>(input.ptr() + 1 * input_stride_in_bytes) + x);
                const uint16x4_t row2 = vld1_u16(reinterpret_cast<const uint16_t *>(input.ptr() + 2 * input_stride_in_bytes) + x);
                const uint16x4_t row3 = vld1_u16(reinterpret_cast<const uint16_t *>(input.ptr() + 3 * input_stride_in_bytes) + x);

                // Transpose 2x2
                const uint16x4x2_t k0_u16 = vtrn_u16(row0, row1);
                const uint16x4x2_t k1_u16 = vtrn_u16(row2, row3);

                // Transpose 4x4
                const uint32x2x2_t k0_u32 = vtrn_u32(vreinterpret_u32_u16(k0_u16.val[0]), vreinterpret_u32_u16(k1_u16.val[0]));
                const uint32x2x2_t k1_u32 = vtrn_u32(vreinterpret_u32_u16(k0_u16.val[1]), vreinterpret_u32_u16(k1_u16.val[1]));

                // Compute destination address: input column x becomes output row x, input row y becomes output column y
                const size_t dst_offset_in_bytes = id.y() * sizeof(uint16_t) + x * output_stride_in_bytes;

                vst1_u16(reinterpret_cast<uint16_t *>(output.ptr() + dst_offset_in_bytes + 0 * output_stride_in_bytes), vreinterpret_u16_u32(k0_u32.val[0]));
                vst1_u16(reinterpret_cast<uint16_t *>(output.ptr() + dst_offset_in_bytes + 1 * output_stride_in_bytes), vreinterpret_u16_u32(k1_u32.val[0]));
                vst1_u16(reinterpret_cast<uint16_t *>(output.ptr() + dst_offset_in_bytes + 2 * output_stride_in_bytes), vreinterpret_u16_u32(k0_u32.val[1]));
                vst1_u16(reinterpret_cast<uint16_t *>(output.ptr() + dst_offset_in_bytes + 3 * output_stride_in_bytes), vreinterpret_u16_u32(k1_u32.val[1]));
            }

            // Compute left-over elements along the x dimension: one input column of 4 rows at a time
            for(; x < window_end_x; ++x)
            {
                const uint16_t val0 = *(reinterpret_cast<uint16_t *>(input.ptr() + 0 * input_stride_in_bytes) + x);
                const uint16_t val1 = *(reinterpret_cast<uint16_t *>(input.ptr() + 1 * input_stride_in_bytes) + x);
                const uint16_t val2 = *(reinterpret_cast<uint16_t *>(input.ptr() + 2 * input_stride_in_bytes) + x);
                const uint16_t val3 = *(reinterpret_cast<uint16_t *>(input.ptr() + 3 * input_stride_in_bytes) + x);

                uint16x4_t result = vdup_n_u16(0);
                result            = vset_lane_u16(val0, result, 0);
                result            = vset_lane_u16(val1, result, 1);
                result            = vset_lane_u16(val2, result, 2);
                result            = vset_lane_u16(val3, result, 3);

                // Compute destination address
                const size_t dst_offset_in_bytes = id.y() * sizeof(uint16_t) + x * output_stride_in_bytes;

                vst1_u16(reinterpret_cast<uint16_t *>(output.ptr() + dst_offset_in_bytes), result);
            }
        },
        input, output);
    }

    if(left_over_loop_y)
    {
        // Visit every (x, y) of the remaining rows individually
        window_in.set(Window::DimX, Window::Dimension(window.x().start(), window.x().end(), 1));
        window_in.set(Window::DimY, Window::Dimension(window_end_y_multiple_of, window_end_y, 1));

        Iterator input(in, window_in);
        Iterator output(out, window_out);

        // Compute left-over elements along the y dimension (1x1)
        execute_window_loop(window_in, [&](const Coordinates & id)
        {
            const uint16_t val0 = *(reinterpret_cast<uint16_t *>(input.ptr()));

            // Compute destination address
            const size_t dst_offset_in_bytes = id.y() * sizeof(uint16_t) + id.x() * output_stride_in_bytes;

            *(reinterpret_cast<uint16_t *>(output.ptr() + dst_offset_in_bytes)) = val0;
        },
        input, output);
    }
}
318
Ethan Doea07c01b2023-04-14 17:24:33 +0000319#ifdef __aarch64__
/** Load eight consecutive 32-bit values as a pair of 128-bit vectors.
 *
 * @param[in] ptr Address of the first of the eight values.
 *
 * @return Vector pair whose val[0] holds ptr[0..3] and val[1] holds ptr[4..7].
 */
inline uint32x4x2_t vld1q_u32_x2_(const uint32_t *ptr)
{
    // gcc-7 doesn't support vld1q_u32_x2 instruction
    uint32x4x2_t pair;
    pair.val[0] = vld1q_u32(ptr);
    pair.val[1] = vld1q_u32(ptr + 4);
    return pair;
}
325
/** Store a pair of 128-bit vectors as eight consecutive 32-bit values.
 *
 * @param[in] ptr Destination address; declared const but written through, so it must refer to writable memory.
 * @param[in] val Vector pair: val[0] is written to ptr[0..3] and val[1] to ptr[4..7].
 */
inline void vst1q_u32_x2_(const uint32_t *ptr, const uint32x4x2_t &val)
{
    // gcc-7 doesn't support vst1q_u32_x2 instruction
    uint32_t *const dst = const_cast<uint32_t *>(ptr);
    vst1q_u32(dst, val.val[0]);
    vst1q_u32(dst + 4, val.val[1]);
}
332
/** Transpose the 32-bit elements of @p in into @p out over the given window (aarch64 variant).
 *
 * Rows are consumed in tiles of 8:
 *  - full 8x8 tiles are transposed with Neon,
 *  - columns left over along x are written as single 8-element strips,
 *  - rows left over along y are copied one element at a time.
 *
 * @param[in]  in     Source tensor.
 * @param[out] out    Destination tensor; element (x, y) of @p in is written to (y, x) of @p out.
 * @param[in]  window Region of @p in to process. Its y range is clamped to the second dimension of @p in.
 */
void transpose_32bit_elements(const ITensor *in, ITensor *out, const Window &window)
{
    constexpr int window_step_x  = 8;
    constexpr int window_step_y  = 8;
    const int     window_start_x = window.x().start();
    const int     window_end_x   = window.x().end();
    const int     window_start_y = window.y().start();
    const int     window_end_y   = std::min(window.y().end(), static_cast<int>(in->info()->dimension(1)));
    // Absolute y coordinate at which the last full tile ends. The window start has to be added back:
    // without it the value is a length, and a window that does not begin at row 0 would skip the SIMD
    // path and make the left-over loop re-process rows that lie outside the window.
    const int    window_end_y_multiple_of = window_start_y + ((window_end_y - window_start_y) / window_step_y) * window_step_y;
    const size_t input_stride_in_bytes    = in->info()->strides_in_bytes()[1];
    const size_t output_stride_in_bytes   = out->info()->strides_in_bytes()[1];

    // Check if we need a left-over loop for the y dimension
    const bool left_over_loop_y = (((window_end_y - window_start_y) % window_step_y) != 0);

    // The x dimension is collapsed: the lambda below walks x manually
    Window window_in(window);
    window_in.set(Window::DimX, Window::Dimension(0, 1, 1));
    if(left_over_loop_y)
    {
        // Check if at least one full tile fits before the left-over rows
        if(window_end_y_multiple_of > window_start_y)
        {
            window_in.set(Window::DimY, Window::Dimension(window_start_y, window_end_y_multiple_of, window_step_y));
        }
        else
        {
            // Empty range: nothing for the SIMD path to do
            window_in.set(Window::DimY, Window::Dimension(0, 0, 1));
        }
    }

    // The output iterator stays put in x/y; destination addresses are computed explicitly
    Window window_out(window);
    window_out.set(Window::DimX, Window::Dimension(0, 0, 0));
    window_out.set(Window::DimY, Window::Dimension(0, 0, 0));

    Iterator output(out, window_out);

    // Run the SIMD path if and only if the input is not a row-vector
    if(in->info()->dimension(1) != 1)
    {
        Iterator input(in, window_in);
        execute_window_loop(window_in, [&](const Coordinates & id)
        {
            // Compute 8x8 elements per iteration
            int x = window_start_x;
            for(; x <= (window_end_x - window_step_x); x += window_step_x)
            {
                // Load
                const uint32x4x2_t row0 = vld1q_u32_x2_(reinterpret_cast<const uint32_t *>(input.ptr() + 0 * input_stride_in_bytes) + x);
                const uint32x4x2_t row1 = vld1q_u32_x2_(reinterpret_cast<const uint32_t *>(input.ptr() + 1 * input_stride_in_bytes) + x);
                const uint32x4x2_t row2 = vld1q_u32_x2_(reinterpret_cast<const uint32_t *>(input.ptr() + 2 * input_stride_in_bytes) + x);
                const uint32x4x2_t row3 = vld1q_u32_x2_(reinterpret_cast<const uint32_t *>(input.ptr() + 3 * input_stride_in_bytes) + x);
                const uint32x4x2_t row4 = vld1q_u32_x2_(reinterpret_cast<const uint32_t *>(input.ptr() + 4 * input_stride_in_bytes) + x);
                const uint32x4x2_t row5 = vld1q_u32_x2_(reinterpret_cast<const uint32_t *>(input.ptr() + 5 * input_stride_in_bytes) + x);
                const uint32x4x2_t row6 = vld1q_u32_x2_(reinterpret_cast<const uint32_t *>(input.ptr() + 6 * input_stride_in_bytes) + x);
                const uint32x4x2_t row7 = vld1q_u32_x2_(reinterpret_cast<const uint32_t *>(input.ptr() + 7 * input_stride_in_bytes) + x);

                // Transpose 2x4
                const uint32x4x2_t k0_u32 = {vtrn1q_u32(row0.val[0], row1.val[0]), vtrn2q_u32(row0.val[0], row1.val[0])};
                const uint32x4x2_t k1_u32 = {vtrn1q_u32(row0.val[1], row1.val[1]), vtrn2q_u32(row0.val[1], row1.val[1])};
                const uint32x4x2_t k2_u32 = {vtrn1q_u32(row2.val[0], row3.val[0]), vtrn2q_u32(row2.val[0], row3.val[0])};
                const uint32x4x2_t k3_u32 = {vtrn1q_u32(row2.val[1], row3.val[1]), vtrn2q_u32(row2.val[1], row3.val[1])};
                const uint32x4x2_t k4_u32 = {vtrn1q_u32(row4.val[0], row5.val[0]), vtrn2q_u32(row4.val[0], row5.val[0])};
                const uint32x4x2_t k5_u32 = {vtrn1q_u32(row4.val[1], row5.val[1]), vtrn2q_u32(row4.val[1], row5.val[1])};
                const uint32x4x2_t k6_u32 = {vtrn1q_u32(row6.val[0], row7.val[0]), vtrn2q_u32(row6.val[0], row7.val[0])};
                const uint32x4x2_t k7_u32 = {vtrn1q_u32(row6.val[1], row7.val[1]), vtrn2q_u32(row6.val[1], row7.val[1])};

                // Transpose 2x2
                const uint64x2x2_t k0_u64 = {vtrn1q_u64(vreinterpretq_u64_u32(k0_u32.val[0]), vreinterpretq_u64_u32(k2_u32.val[0])), vtrn2q_u64(vreinterpretq_u64_u32(k0_u32.val[0]), vreinterpretq_u64_u32(k2_u32.val[0]))};
                const uint64x2x2_t k1_u64 = {vtrn1q_u64(vreinterpretq_u64_u32(k0_u32.val[1]), vreinterpretq_u64_u32(k2_u32.val[1])), vtrn2q_u64(vreinterpretq_u64_u32(k0_u32.val[1]), vreinterpretq_u64_u32(k2_u32.val[1]))};
                const uint64x2x2_t k2_u64 = {vtrn1q_u64(vreinterpretq_u64_u32(k1_u32.val[0]), vreinterpretq_u64_u32(k3_u32.val[0])), vtrn2q_u64(vreinterpretq_u64_u32(k1_u32.val[0]), vreinterpretq_u64_u32(k3_u32.val[0]))};
                const uint64x2x2_t k3_u64 = {vtrn1q_u64(vreinterpretq_u64_u32(k1_u32.val[1]), vreinterpretq_u64_u32(k3_u32.val[1])), vtrn2q_u64(vreinterpretq_u64_u32(k1_u32.val[1]), vreinterpretq_u64_u32(k3_u32.val[1]))};
                const uint64x2x2_t k4_u64 = {vtrn1q_u64(vreinterpretq_u64_u32(k4_u32.val[0]), vreinterpretq_u64_u32(k6_u32.val[0])), vtrn2q_u64(vreinterpretq_u64_u32(k4_u32.val[0]), vreinterpretq_u64_u32(k6_u32.val[0]))};
                const uint64x2x2_t k5_u64 = {vtrn1q_u64(vreinterpretq_u64_u32(k4_u32.val[1]), vreinterpretq_u64_u32(k6_u32.val[1])), vtrn2q_u64(vreinterpretq_u64_u32(k4_u32.val[1]), vreinterpretq_u64_u32(k6_u32.val[1]))};
                const uint64x2x2_t k6_u64 = {vtrn1q_u64(vreinterpretq_u64_u32(k5_u32.val[0]), vreinterpretq_u64_u32(k7_u32.val[0])), vtrn2q_u64(vreinterpretq_u64_u32(k5_u32.val[0]), vreinterpretq_u64_u32(k7_u32.val[0]))};
                const uint64x2x2_t k7_u64 = {vtrn1q_u64(vreinterpretq_u64_u32(k5_u32.val[1]), vreinterpretq_u64_u32(k7_u32.val[1])), vtrn2q_u64(vreinterpretq_u64_u32(k5_u32.val[1]), vreinterpretq_u64_u32(k7_u32.val[1]))};

                // Swap blocks
                const uint32x4x2_t col0 = {vreinterpretq_u32_u64(k0_u64.val[0]), vreinterpretq_u32_u64(k4_u64.val[0])};
                const uint32x4x2_t col1 = {vreinterpretq_u32_u64(k1_u64.val[0]), vreinterpretq_u32_u64(k5_u64.val[0])};
                const uint32x4x2_t col2 = {vreinterpretq_u32_u64(k0_u64.val[1]), vreinterpretq_u32_u64(k4_u64.val[1])};
                const uint32x4x2_t col3 = {vreinterpretq_u32_u64(k1_u64.val[1]), vreinterpretq_u32_u64(k5_u64.val[1])};
                const uint32x4x2_t col4 = {vreinterpretq_u32_u64(k2_u64.val[0]), vreinterpretq_u32_u64(k6_u64.val[0])};
                const uint32x4x2_t col5 = {vreinterpretq_u32_u64(k3_u64.val[0]), vreinterpretq_u32_u64(k7_u64.val[0])};
                const uint32x4x2_t col6 = {vreinterpretq_u32_u64(k2_u64.val[1]), vreinterpretq_u32_u64(k6_u64.val[1])};
                const uint32x4x2_t col7 = {vreinterpretq_u32_u64(k3_u64.val[1]), vreinterpretq_u32_u64(k7_u64.val[1])};

                // Compute destination address: input column x becomes output row x, input row y becomes output column y
                const size_t dst_offset_in_bytes = id.y() * sizeof(uint32_t) + x * output_stride_in_bytes;

                // Store
                vst1q_u32_x2_(reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes + 0 * output_stride_in_bytes), col0);
                vst1q_u32_x2_(reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes + 1 * output_stride_in_bytes), col1);
                vst1q_u32_x2_(reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes + 2 * output_stride_in_bytes), col2);
                vst1q_u32_x2_(reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes + 3 * output_stride_in_bytes), col3);
                vst1q_u32_x2_(reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes + 4 * output_stride_in_bytes), col4);
                vst1q_u32_x2_(reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes + 5 * output_stride_in_bytes), col5);
                vst1q_u32_x2_(reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes + 6 * output_stride_in_bytes), col6);
                vst1q_u32_x2_(reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes + 7 * output_stride_in_bytes), col7);
            }

            // Compute left-over elements along the x dimension: one input column of 8 rows at a time
            for(; x < window_end_x; ++x)
            {
                const uint32_t val0 = *(reinterpret_cast<uint32_t *>(input.ptr() + 0 * input_stride_in_bytes) + x);
                const uint32_t val1 = *(reinterpret_cast<uint32_t *>(input.ptr() + 1 * input_stride_in_bytes) + x);
                const uint32_t val2 = *(reinterpret_cast<uint32_t *>(input.ptr() + 2 * input_stride_in_bytes) + x);
                const uint32_t val3 = *(reinterpret_cast<uint32_t *>(input.ptr() + 3 * input_stride_in_bytes) + x);
                const uint32_t val4 = *(reinterpret_cast<uint32_t *>(input.ptr() + 4 * input_stride_in_bytes) + x);
                const uint32_t val5 = *(reinterpret_cast<uint32_t *>(input.ptr() + 5 * input_stride_in_bytes) + x);
                const uint32_t val6 = *(reinterpret_cast<uint32_t *>(input.ptr() + 6 * input_stride_in_bytes) + x);
                const uint32_t val7 = *(reinterpret_cast<uint32_t *>(input.ptr() + 7 * input_stride_in_bytes) + x);

                uint32x4_t result0 = vdupq_n_u32(0);
                uint32x4_t result1 = vdupq_n_u32(0);
                result0            = vsetq_lane_u32(val0, result0, 0);
                result0            = vsetq_lane_u32(val1, result0, 1);
                result0            = vsetq_lane_u32(val2, result0, 2);
                result0            = vsetq_lane_u32(val3, result0, 3);
                result1            = vsetq_lane_u32(val4, result1, 0);
                result1            = vsetq_lane_u32(val5, result1, 1);
                result1            = vsetq_lane_u32(val6, result1, 2);
                result1            = vsetq_lane_u32(val7, result1, 3);

                // Compute destination address
                const size_t dst_offset_in_bytes = id.y() * sizeof(uint32_t) + x * output_stride_in_bytes;

                vst1q_u32_x2_(reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes), {result0, result1});
            }
        },
        input, output);
    }

    if(left_over_loop_y)
    {
        // Visit every (x, y) of the remaining rows individually
        window_in.set(Window::DimX, Window::Dimension(window.x().start(), window.x().end(), 1));
        window_in.set(Window::DimY, Window::Dimension(window_end_y_multiple_of, window_end_y, 1));

        Iterator input(in, window_in);
        Iterator output(out, window_out);

        // Compute left-over elements along the y dimension (1x1)
        execute_window_loop(window_in, [&](const Coordinates & id)
        {
            const uint32_t val0 = *(reinterpret_cast<uint32_t *>(input.ptr()));

            // Compute destination address
            const size_t dst_offset_in_bytes = id.y() * sizeof(uint32_t) + id.x() * output_stride_in_bytes;

            *(reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes)) = val0;
        },
        input, output);
    }
}
486#else // __aarch64__
/** Transpose the 32-bit elements of @p in into @p out over the given window (non-aarch64 variant).
 *
 * Rows are consumed in tiles of 4:
 *  - full 4x4 tiles are transposed with Neon,
 *  - columns left over along x are written as single 4-element strips,
 *  - rows left over along y are copied one element at a time.
 *
 * @param[in]  in     Source tensor.
 * @param[out] out    Destination tensor; element (x, y) of @p in is written to (y, x) of @p out.
 * @param[in]  window Region of @p in to process. Its y range is clamped to the second dimension of @p in.
 */
void transpose_32bit_elements(const ITensor *in, ITensor *out, const Window &window)
{
    const int window_step_x  = 4;
    const int window_step_y  = 4;
    const int window_start_x = window.x().start();
    const int window_end_x   = window.x().end();
    const int window_start_y = window.y().start();
    const int window_end_y   = std::min(window.y().end(), static_cast<int>(in->info()->dimension(1)));
    // Absolute y coordinate at which the last full tile ends. The window start has to be added back:
    // without it the value is a length, and a window that does not begin at row 0 would skip the SIMD
    // path and make the left-over loop re-process rows that lie outside the window.
    const int    window_end_y_multiple_of = window_start_y + ((window_end_y - window_start_y) / window_step_y) * window_step_y;
    const size_t input_stride_in_bytes    = in->info()->strides_in_bytes()[1];
    const size_t output_stride_in_bytes   = out->info()->strides_in_bytes()[1];

    // Check if we need a left-over loop for the y dimension
    const bool left_over_loop_y = (((window_end_y - window_start_y) % window_step_y) != 0);

    // The x dimension is collapsed: the lambda below walks x manually
    Window window_in(window);
    window_in.set(Window::DimX, Window::Dimension(0, 1, 1));
    if(left_over_loop_y)
    {
        // Check if at least one full tile fits before the left-over rows
        if(window_end_y_multiple_of > window_start_y)
        {
            window_in.set(Window::DimY, Window::Dimension(window_start_y, window_end_y_multiple_of, window_step_y));
        }
        else
        {
            // Empty range: nothing for the SIMD path to do
            window_in.set(Window::DimY, Window::Dimension(0, 0, 1));
        }
    }

    // The output iterator stays put in x/y; destination addresses are computed explicitly
    Window window_out(window);
    window_out.set(Window::DimX, Window::Dimension(0, 0, 0));
    window_out.set(Window::DimY, Window::Dimension(0, 0, 0));

    Iterator output(out, window_out);

    // Run the SIMD path if and only if the input is not a row-vector
    if(in->info()->dimension(1) != 1)
    {
        Iterator input(in, window_in);
        execute_window_loop(window_in, [&](const Coordinates & id)
        {
            // Compute 4x4 elements per iteration
            int x = window_start_x;
            for(; x <= (window_end_x - window_step_x); x += window_step_x)
            {
                const uint32x4_t row0 = vld1q_u32(reinterpret_cast<const uint32_t *>(input.ptr() + 0 * input_stride_in_bytes) + x);
                const uint32x4_t row1 = vld1q_u32(reinterpret_cast<const uint32_t *>(input.ptr() + 1 * input_stride_in_bytes) + x);
                const uint32x4_t row2 = vld1q_u32(reinterpret_cast<const uint32_t *>(input.ptr() + 2 * input_stride_in_bytes) + x);
                const uint32x4_t row3 = vld1q_u32(reinterpret_cast<const uint32_t *>(input.ptr() + 3 * input_stride_in_bytes) + x);

                // Transpose 2x2
                const uint32x2x2_t k0_u32 = vtrn_u32(vget_low_u32(row0), vget_low_u32(row1));
                const uint32x2x2_t k1_u32 = vtrn_u32(vget_high_u32(row2), vget_high_u32(row3));
                const uint32x2x2_t k2_u32 = vtrn_u32(vget_high_u32(row0), vget_high_u32(row1));
                const uint32x2x2_t k3_u32 = vtrn_u32(vget_low_u32(row2), vget_low_u32(row3));

                // Compute destination address: input column x becomes output row x, input row y becomes output column y
                const size_t dst_offset_in_bytes = id.y() * sizeof(uint32_t) + x * output_stride_in_bytes;

                // Swap block 01 with block 10 and store
                vst1q_u32(reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes + 0 * output_stride_in_bytes), vcombine_u32(k0_u32.val[0], k3_u32.val[0]));
                vst1q_u32(reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes + 1 * output_stride_in_bytes), vcombine_u32(k0_u32.val[1], k3_u32.val[1]));
                vst1q_u32(reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes + 2 * output_stride_in_bytes), vcombine_u32(k2_u32.val[0], k1_u32.val[0]));
                vst1q_u32(reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes + 3 * output_stride_in_bytes), vcombine_u32(k2_u32.val[1], k1_u32.val[1]));
            }

            // Compute left-over elements along the x dimension: one input column of 4 rows at a time
            for(; x < window_end_x; ++x)
            {
                const uint32_t val0 = *(reinterpret_cast<uint32_t *>(input.ptr() + 0 * input_stride_in_bytes) + x);
                const uint32_t val1 = *(reinterpret_cast<uint32_t *>(input.ptr() + 1 * input_stride_in_bytes) + x);
                const uint32_t val2 = *(reinterpret_cast<uint32_t *>(input.ptr() + 2 * input_stride_in_bytes) + x);
                const uint32_t val3 = *(reinterpret_cast<uint32_t *>(input.ptr() + 3 * input_stride_in_bytes) + x);

                uint32x4_t result = vdupq_n_u32(0);
                result            = vsetq_lane_u32(val0, result, 0);
                result            = vsetq_lane_u32(val1, result, 1);
                result            = vsetq_lane_u32(val2, result, 2);
                result            = vsetq_lane_u32(val3, result, 3);

                // Compute destination address
                const size_t dst_offset_in_bytes = id.y() * sizeof(uint32_t) + x * output_stride_in_bytes;

                vst1q_u32(reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes), result);
            }
        },
        input, output);
    }

    if(left_over_loop_y)
    {
        // Visit every (x, y) of the remaining rows individually
        window_in.set(Window::DimX, Window::Dimension(window.x().start(), window.x().end(), 1));
        window_in.set(Window::DimY, Window::Dimension(window_end_y_multiple_of, window_end_y, 1));

        Iterator input(in, window_in);
        Iterator output(out, window_out);

        // Compute left-over elements along the y dimension (1x1)
        execute_window_loop(window_in, [&](const Coordinates & id)
        {
            const uint32_t val0 = *(reinterpret_cast<uint32_t *>(input.ptr()));

            // Compute destination address
            const size_t dst_offset_in_bytes = id.y() * sizeof(uint32_t) + id.x() * output_stride_in_bytes;

            *(reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes)) = val0;
        },
        input, output);
    }
}
Ethan Doea07c01b2023-04-14 17:24:33 +0000598#endif // __aarch64__
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100599} // namespace
600
Teresa Charlind1dc09c2021-03-04 15:24:45 +0000601void CpuTransposeKernel::configure(const ITensorInfo *src, ITensorInfo *dst)
Gian Marco7c435f22017-12-05 16:17:23 +0000602{
Teresa Charlind1dc09c2021-03-04 15:24:45 +0000603 ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
Gian Marco7c435f22017-12-05 16:17:23 +0000604
Teresa Charlind1dc09c2021-03-04 15:24:45 +0000605 // Destination auto inizialitation if not yet initialized
606 const TensorShape dst_shape = misc::shape_calculator::compute_transposed_shape(*src);
607 auto_init_if_empty(*dst, src->clone()->set_tensor_shape(dst_shape));
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100608
Teresa Charlind1dc09c2021-03-04 15:24:45 +0000609 // Perform validation step
610 ARM_COMPUTE_ERROR_THROW_ON(validate(src, dst));
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100611
Michalis Spyrou0b1452d2020-02-27 16:20:19 +0000612 // Note: This kernel performs 16 elements per iteration.
613 // However, since we use a left-over for loop on both dimensions (X and Y), we cannot have any read or write out of memory
614 // For this reason num_elems_processed_per_iteration_x is set to 1
615 const unsigned int num_elems_processed_per_iteration_x = 1;
Teresa Charlind1dc09c2021-03-04 15:24:45 +0000616 const unsigned int num_elems_processed_per_iteration_y = num_elems_processed(src->element_size());
Michalis Spyrou0b1452d2020-02-27 16:20:19 +0000617
618 // Configure kernel window
Teresa Charlind1dc09c2021-03-04 15:24:45 +0000619 Window win = calculate_max_window(*src, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
Michalis Spyrou0b1452d2020-02-27 16:20:19 +0000620
Teresa Charlind1dc09c2021-03-04 15:24:45 +0000621 // The CpuTranspose doesn't need padding so update_window_and_padding() can be skipped
622 Coordinates coord;
623 coord.set_num_dimensions(dst->num_dimensions());
624 dst->set_valid_region(ValidRegion(coord, dst->tensor_shape()));
625
626 ICpuKernel::configure(win);
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100627}
628
Teresa Charlind1dc09c2021-03-04 15:24:45 +0000629Status CpuTransposeKernel::validate(const ITensorInfo *src, const ITensorInfo *dst)
630{
631 ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src);
Michele Di Giorgio33f41fa2021-03-09 14:09:08 +0000632 //Note: ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input) is not needed here as this kernel doesn't use CPU FP16 instructions.
Teresa Charlind1dc09c2021-03-04 15:24:45 +0000633 ARM_COMPUTE_RETURN_ERROR_ON(src->data_type() == DataType::UNKNOWN);
634
635 // Error if input is not 8 bit, 16bit or 32bit
636 ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->element_size() != 1 && src->element_size() != 2 && src->element_size() != 4,
637 "Element size not supported");
638
639 // Validate configured destination
640 if(dst->total_size() != 0)
641 {
642 const TensorShape dst_shape = misc::shape_calculator::compute_transposed_shape(*src);
643
644 ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(), dst_shape);
645 ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(src, dst);
646 ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
647 }
648
649 return Status{};
650}
651
652void CpuTransposeKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100653{
Moritz Pflanzerc186b572017-09-07 09:48:04 +0100654 ARM_COMPUTE_UNUSED(info);
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100655 ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
Teresa Charlind1dc09c2021-03-04 15:24:45 +0000656 ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window);
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100657
Teresa Charlind1dc09c2021-03-04 15:24:45 +0000658 const auto src = tensors.get_const_tensor(TensorType::ACL_SRC);
659 auto dst = tensors.get_tensor(TensorType::ACL_DST);
660
661 switch(src->info()->element_size())
662 {
663 case 1:
664 transpose_8bit_elements(src, dst, window);
665 break;
666 case 2:
667 transpose_16bit_elements(src, dst, window);
668 break;
669 case 4:
670 transpose_32bit_elements(src, dst, window);
671 break;
672 default:
673 ARM_COMPUTE_ERROR("Element size not supported");
674 break;
675 }
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100676}
Teresa Charlind1dc09c2021-03-04 15:24:45 +0000677
678const char *CpuTransposeKernel::name() const
679{
680 return "CpuTransposeKernel";
681}
682} // namespace kernels
683} // namespace cpu
684} // namespace arm_compute