/*
 * Copyright (c) 2022-2023 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include "addressing.hpp"
#include "utils.hpp"
#include <algorithm>
#include <cstring>

namespace arm_conv {
namespace addressing {

/* Pointer array constructor
 *
 * Fill an array of pointers, one per element of an array_rows x array_cols
 * grid. Elements inside the valid region receive a pointer into the source
 * tensor; elements in the padded border receive a pointer to the padding
 * buffer instead.
 */
void fill_pointer_array(
  size_t element_size,
  void **dest_raw, const unsigned int array_rows, const unsigned int array_cols,
  void *base_ptr_raw, size_t ld_row, size_t ld_col,
  void *pad_buffer_raw,
  const unsigned int pad_top, const unsigned int valid_rows,
  const unsigned int pad_left, const unsigned int valid_cols
)
{
  auto dest = reinterpret_cast<char **>(dest_raw);
  auto base_ptr = reinterpret_cast<char *>(base_ptr_raw);
  auto pad_buffer = reinterpret_cast<char *>(pad_buffer_raw);

  // Convert the leading dimensions from elements to bytes
  ld_row *= element_size;
  ld_col *= element_size;

  const auto last_valid_row = std::min(pad_top + valid_rows, array_rows);
  const auto last_valid_col = std::min(pad_left + valid_cols, array_cols);

  // Rows of padding above the valid region
  unsigned int i = 0;
  for (; i < pad_top; i++)
  {
    for (unsigned int j = 0; j < array_cols; j++)
    {
      *(dest++) = pad_buffer;
    }
  }

  // Valid rows: left padding, then valid columns, then right padding
  for (; i < last_valid_row; i++)
  {
    unsigned int j = 0;
    auto colptr = base_ptr;
    base_ptr += ld_row;

    for (; j < pad_left; j++)
    {
      *(dest++) = pad_buffer;
    }
    for (; j < last_valid_col; j++)
    {
      *(dest++) = colptr;
      colptr += ld_col;
    }
    for (; j < array_cols; j++)
    {
      *(dest++) = pad_buffer;
    }
  }

  // Rows of padding below the valid region
  for (; i < array_rows; i++)
  {
    for (unsigned int j = 0; j < array_cols; j++)
    {
      *(dest++) = pad_buffer;
    }
  }
}
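
/* Usage sketch (illustrative only, not part of the library). Build a 3x3
 * pointer array over a 4x4 float tensor with one row of top padding; the
 * buffer names here are hypothetical.
 *
 *   float input[4][4];
 *   float pad[4] = {};      // padding values for out-of-range elements
 *   void *pointers[3 * 3];
 *   fill_pointer_array(
 *     sizeof(float), pointers, 3, 3,
 *     input, 4, 1,          // leading dimensions in elements; scaled to bytes internally
 *     pad,
 *     1, 2,                 // pad_top = 1, followed by two valid rows
 *     0, 3                  // no left padding, three valid columns
 *   );
 *   // pointers[0..2] now point at pad; pointers[3..8] point into input.
 */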

/* Pointer array constructor (generic kernels)
 *
 * Fill an array of input pointers: one pointer per kernel point per output
 * point. The entries belonging to a single output point are spaced
 * output_rows * output_cols entries apart, so the array is laid out
 * kernel-point major.
 */
void fill_pointer_array_generic_kernel(
  const size_t element_size,
  void **dest_raw,
  const unsigned int output_rows, const unsigned int output_cols,
  const unsigned int kernel_rows, const unsigned int kernel_cols,
  const unsigned int stride_rows, const unsigned int stride_cols,
  void *base_ptr_raw, size_t ld_row, size_t ld_col,
  void *pad_buffer_raw,
  const unsigned int pad_top, const unsigned int valid_rows,
  const unsigned int pad_left, const unsigned int valid_cols
)
{
  auto dest = reinterpret_cast<char **>(dest_raw);
  auto base_ptr = reinterpret_cast<char *>(base_ptr_raw);
  auto pad_buffer = reinterpret_cast<char *>(pad_buffer_raw);
  ld_row *= element_size;
  ld_col *= element_size;

  const auto last_valid_row = pad_top + valid_rows;
  const auto last_valid_col = pad_left + valid_cols;
  const auto point_stride = output_rows * output_cols;

  // Iterate over the output points; after each output point, advance the
  // pointer into the address array.
  for (unsigned int oi = 0; oi < output_rows; oi++)
  {
    for (unsigned int oj = 0; oj < output_cols; oj++)
    {
      auto point_dest = dest;
      dest++;

      // Iterate over the kernel points and fill in the pointer array.
      unsigned int ki = 0, ii = oi*stride_rows;
      for (; ii < pad_top && ki < kernel_rows; ii++, ki++)
      {
        // Rows of the receptive field which fall in the top padding
        for (unsigned int j = 0; j < kernel_cols; j++)
        {
          *point_dest = pad_buffer;
          point_dest += point_stride;
        }
      }
      for (; ii < last_valid_row && ki < kernel_rows; ii++, ki++)
      {
        unsigned int kj = 0, ij = oj*stride_cols;
        for (; ij < pad_left && kj < kernel_cols; ij++, kj++)
        {
          // Left padding
          *point_dest = pad_buffer;
          point_dest += point_stride;
        }
        for (; ij < last_valid_col && kj < kernel_cols; ij++, kj++)
        {
          *point_dest = base_ptr + (ii - pad_top)*ld_row + (ij - pad_left)*ld_col;
          point_dest += point_stride;
        }
        for (; kj < kernel_cols; kj++)
        {
          // Right padding
          *point_dest = pad_buffer;
          point_dest += point_stride;
        }
      }
      for (; ki < kernel_rows; ki++)
      {
        // Rows of the receptive field which fall in the bottom padding
        for (unsigned int j = 0; j < kernel_cols; j++)
        {
          *point_dest = pad_buffer;
          point_dest += point_stride;
        }
      }
    }
  }
}
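
/* Usage sketch (illustrative only, not part of the library). A 3x3 kernel
 * at stride 1 producing a 2x2 output needs 3*3 * 2*2 = 36 pointers; the
 * buffer names here are hypothetical.
 *
 *   float input[4][4];      // exactly the 4x4 receptive field required
 *   float pad = 0.0f;
 *   void *pointers[36];
 *   fill_pointer_array_generic_kernel(
 *     sizeof(float), pointers,
 *     2, 2,          // output rows, columns
 *     3, 3,          // kernel rows, columns
 *     1, 1,          // stride rows, columns
 *     input, 4, 1,   // leading dimensions in elements
 *     &pad,
 *     0, 4,          // no top padding, four valid rows
 *     0, 4           // no left padding, four valid columns
 *   );
 *   // pointers[k*4 + p] is the input for kernel point k at output point p.
 */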

/* Patch array constructor
 *
 * Some depthwise kernels require an NCHW-ordered patch of input. Here we
 * construct such a patch, and fill in an array of pointers to the rows of the
 * patch.
 */
void fill_nchw_patch_array(
  size_t element_size,
  const void **dest_row_pointers_raw,  // Array of pointers to each row of the patch
  void *dest_patch_raw,  // Pointer to space which can be used to construct the patch
  const unsigned int patch_rows, unsigned int patch_cols,  // Patch size
  const void *src_ptr_raw, size_t ld_row, size_t ld_col,  // Source tensor
  const void *pad_row,  // Pointer to a row of padding values
  const unsigned int pad_top, const unsigned int valid_rows,
  const unsigned int pad_left, const unsigned int valid_cols
)
{
  // Convert into more useful types
  auto row_pointers = reinterpret_cast<const char **>(dest_row_pointers_raw);
  auto dest_patch = reinterpret_cast<char *>(dest_patch_raw);
  auto src = reinterpret_cast<const char *>(src_ptr_raw);
  ld_row *= element_size;
  ld_col *= element_size;

  // Round up the patch columns to be a full quad
  patch_cols = arm_gemm::roundup<unsigned int>(patch_cols, 16 / element_size);

  const auto last_valid_row = std::min(pad_top + valid_rows, patch_rows);
  const auto last_valid_col = std::min(pad_left + valid_cols, patch_cols);

  // Construct the patch and the row pointer array together
  unsigned int i = 0;
  for (; i < pad_top; i++)
  {
    // Insert a pointer to the padding row
    *(row_pointers++) = reinterpret_cast<const char *>(pad_row);
  }
  for (; i < last_valid_row; i++)
  {
    // Get a copy of the pointer for this row
    auto colptr = src;
    src += ld_row;

    // If the input is already in NCHW format (ld_col == element_size) AND
    // there is no padding, then we can use a pointer into the source tensor
    // directly; otherwise we need to construct a patch and provide a pointer
    // to it.
    if (ld_col == element_size && pad_left == 0 && last_valid_col == patch_cols)
    {
      *(row_pointers++) = colptr;
    }
    else
    {
      auto patch_col = dest_patch;
      *(row_pointers++) = dest_patch;
      dest_patch += element_size * patch_cols;  // Move the patch pointer on

      // Construct the patch; fill the entirety with padding and then copy in
      // the valid elements.
      memcpy(patch_col, pad_row, element_size * patch_cols);
      patch_col += pad_left * element_size;  // Move over the left padding

      if (ld_col == element_size)
      {
        // If the input is NCHW then copy across as many columns as we can.
        memcpy(patch_col, colptr, (last_valid_col - pad_left) * element_size);
      }
      else
      {
        // If the input is NHWC then copy the columns across in turn.
        for (auto j = pad_left; j < last_valid_col; j++)
        {
          memcpy(patch_col, colptr, element_size);  // Copy the valid element
          patch_col += element_size;                // Advance the patch destination
          colptr += ld_col;                         // Advance the source pointer
        }
      }
    }
  }
  for (; i < patch_rows; i++)
  {
    // Insert a pointer to the padding row
    *(row_pointers++) = reinterpret_cast<const char *>(pad_row);
  }
}
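
/* Usage sketch (illustrative only, not part of the library). Gather a 3x4
 * patch (four floats is exactly one quad) from a hypothetical NHWC source
 * with eight channels; `src` is assumed to point at the first valid element
 * and the leading dimensions are given in elements.
 *
 *   const float *rows[3];
 *   float patch_space[3 * 4];     // scratch space used to build the patch
 *   const float pad_row[4] = {};  // a full quad of padding values
 *   fill_nchw_patch_array(
 *     sizeof(float), reinterpret_cast<const void **>(rows), patch_space,
 *     3, 4,        // patch rows, columns
 *     src, 64, 8,  // NHWC: ld_col is the channel count (8)
 *     pad_row,
 *     0, 3,        // no top padding, three valid rows
 *     1, 3         // one column of left padding, three valid columns
 *   );
 *   // rows[0..2] each point at a row of patch_space whose first element was
 *   // copied from pad_row and whose remainder was gathered from src.
 */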


/* Patch array constructor (generic kernels)
 *
 * Construct an array of pointers; one pointer for each output row for each
 * kernel point. Pointers should point at a whole number of QUADS containing an
 * input point for each output point. If the kernel column stride is 1 and the
 * data is NCHW then the input tensor might be addressed directly; otherwise a
 * new patch sample might need to be constructed.
 */
void fill_patch_array_generic_kernel(
  size_t element_size,
  const void **dest_pointers_raw,  // Pointers: one per output row per kernel point
  void *patch_raw,  // Pointer to space which can be used to construct the patch
  const unsigned int output_rows, const unsigned int output_cols,
  const unsigned int kernel_rows, const unsigned int kernel_cols,
  const unsigned int stride_rows, const unsigned int stride_cols,
  const void *src_ptr_raw, size_t ld_row, size_t ld_col,  // Source tensor
  const void *pad_row,  // Pointer to a row of padding values
  const unsigned int pad_top, const unsigned int valid_rows,
  const unsigned int pad_left, const unsigned int valid_cols
)
{
  auto dest = reinterpret_cast<const char **>(dest_pointers_raw);
  auto patch = reinterpret_cast<char *>(patch_raw);
  auto src_ptr = reinterpret_cast<const char *>(src_ptr_raw);
  ld_row *= element_size;
  ld_col *= element_size;

  // Round up the patch columns to a multiple of quad-length
  const auto patch_cols = arm_gemm::roundup<unsigned int>(output_cols, 16 / element_size);

  const auto input_rows = kernel_rows + (output_rows - 1) * stride_rows;
  const auto last_valid_row = std::min(pad_top + valid_rows, input_rows);

  const auto input_cols = kernel_cols + (output_cols - 1) * stride_cols;
  const auto last_valid_col = std::min(pad_left + valid_cols, input_cols);

  for (auto ki = 0u; ki < kernel_rows; ki++)
  {
    for (auto kj = 0u; kj < kernel_cols; kj++)
    {
      auto oi = 0u, ii = ki;
      for (; oi < output_rows && ii < pad_top; oi++, ii += stride_rows)
      {
        // Insert a pointer to the padding row
        *(dest++) = reinterpret_cast<const char *>(pad_row);
      }
      for (; oi < output_rows && ii < last_valid_row; oi++, ii += stride_rows)
      {
        auto rowptr = src_ptr + (ii - pad_top) * ld_row;

        // Construct a sample of the input here
        auto patch_pos = patch;
        *(dest++) = patch;
        patch += patch_cols * element_size;

        // Fill with padding
        memcpy(patch_pos, pad_row, patch_cols * element_size);

        // Fill in the valid elements
        auto oj = 0u, ij = kj;
        for (; oj < patch_cols && ij < pad_left; oj++, ij += stride_cols)
        {
          // Do nothing for left padding; the memcpy above already filled it
          patch_pos += element_size;
        }
        for (; oj < patch_cols && ij < last_valid_col; oj++, ij += stride_cols)
        {
          // Copy from the source tensor
          memcpy(patch_pos, rowptr + (ij - pad_left)*ld_col, element_size);
          patch_pos += element_size;
        }
        // No action required for right-hand padding
      }
      for (; oi < output_rows; oi++)
      {
        // Insert a pointer to the padding row
        *(dest++) = reinterpret_cast<const char *>(pad_row);
      }
    }
  }
}
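
/* Usage sketch (illustrative only, not part of the library). For a 3x3
 * kernel at stride 1 over a 2x2 output there is one pointer per output row
 * per kernel point, i.e. 3*3*2 = 18 pointers, each addressing a quad-padded
 * row of roundup(output_cols, 4) = 4 floats. `src` and the leading
 * dimensions are hypothetical; the patch space is sized for the worst case
 * of one row built per pointer.
 *
 *   const float *rows[3 * 3 * 2];
 *   float patch_space[3 * 3 * 2 * 4];
 *   const float pad_row[4] = {};
 *   fill_patch_array_generic_kernel(
 *     sizeof(float), reinterpret_cast<const void **>(rows), patch_space,
 *     2, 2,        // output rows, columns
 *     3, 3,        // kernel rows, columns
 *     1, 1,        // stride rows, columns
 *     src, 64, 8,  // NHWC source with eight channels, ld_* in elements
 *     pad_row,
 *     0, 4,        // no top padding, four valid rows
 *     0, 4         // no left padding, four valid columns
 *   );
 */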

} // namespace addressing
} // namespace arm_conv