blob: 9805fd354f5565eb30c9596496ddc795e1ed56e6 [file] [log] [blame]
ramelg018a164882022-04-07 02:42:52 +01001/*
Michael Tyler74921ee2023-04-12 17:43:17 +01002 * Copyright (c) 2022-2023 Arm Limited.
ramelg018a164882022-04-07 02:42:52 +01003 *
4 * SPDX-License-Identifier: MIT
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to
8 * deal in the Software without restriction, including without limitation the
9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10 * sell copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in all
14 * copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24
25/* Depthwise kernel drivers commonly require a per-thread blob of working space
26 * in which to store parameters required by the depthwise implementations. The
27 * composition of this working space varies with the driver, kernel, and data
28 * types -- but the tasks of requesting sufficient space, allocating buffer
29 * space, and performing initialisation of the working space are common.
30 *
31 * The classes in this file consist of a number of working space "Elements"
32 * (which are logical units of functionality) and a Workspace type which allows
33 * for compile time composition of elements into a single working space type.
34 *
35 * Creating a workspace
36 * ====================
37 *
38 * A new workspace type can be created by combining Elements as an argument to
39 * the Workspace class. For instance:
40 *
41 * Workspace<
42 * depthwise_depthfirst::InputArrayElement<float>,
43 * InputBufferElement<float>,
44 * OutputArrayElement<float>
45 * >
46 *
47 * Creates a new Workspace consisting of the given elements. The workspace type
48 * contained within this class (`Workspace<...>::WorkspaceType`) is equivalent to:
49 *
50 * struct WorkspaceType
51 * {
52 * const float **inptr_array; // From InputArrayElement<float>
53 * float *input_buffer; // From InputBufferElement<float>
54 * float **outptr_array; // From OutputArrayElement<float>
55 * float *output_buffer; // From OutputArrayElement<float>
56 * };
57 *
58 * Calling `Workspace<...>::get_sizeof_workspace(...)` will return the amount
59 * of space required to store the above struct and the elements contained
60 * within it. Once this space has been allocated, the workspace can be
61 * initialised by calling `Workspace<...>::initialise` with a pointer to the
62 * buffer and the same arguments. This will place a struct of type
63 * `Workspace<...>::WorkspaceType` at the start of the buffer, and share the
64 * remaining space between the specified elements. As this is all done at
65 * compile time, later code can access elements from the `WorkspaceType` by
66 * name.
67 *
68 * Writing a new element
69 * =====================
70 *
71 * Each Element must provide:
72 * - A struct called "Workspace" containing the variables contained within
73 * this portion of the workspace.
74 * - A static method called `get_element_size` which returns the amount of
75 * buffer space required by this element of the workspace (NOT including the
76 * size of the Workspace struct). For example, an element which stores a
 * vector of pointers will return the amount of space required to store the
78 * vector.
79 * - A static method called `initialise` which accepts a pointer to a struct
80 * which will be composed of the Element's `Workspace` struct (along with
81 * other elements), a pointer to the start of the buffer allocated for this
82 * portion of the workspace, and arguments to be used to initialise the
83 * workspace. The Element should consume as much of the buffer as it
84 * requires, initialise the Workspace, and then return the pointer to the
85 * next free byte of the buffer.
86 *
87 * See the below elements for an example of how this should work.
88 */
89
90#pragma once
91
92#include "depthwise.hpp"
93#include "depthfirst_driver.hpp"
Michael Tyler74921ee2023-04-12 17:43:17 +010094#include "utils.hpp"
ramelg018a164882022-04-07 02:42:52 +010095
96namespace arm_conv {
97namespace depthwise {
98namespace { // anonymous because we expect this to appear in several compilation units
99
100/* Arguments to use to size and initialise a workspace.
101 */
102template <class StratType, class OutputStage=Nothing>
103struct WorkspaceArgs
104{
105 const StratType *strategy;
106 const DepthwiseArgs &depthwise_args;
107 const OutputStage &output_stage;
108
109 WorkspaceArgs(const StratType *strat, const DepthwiseArgs &dwargs, const OutputStage &os = {})
110 : strategy(strat), depthwise_args(dwargs), output_stage(os)
111 {
112 }
113};
114
115
116/* Sometimes we use templated structs to fill in workspace types, the Empty
117 * element can be useful for when a blank element is required for some sets of
118 * parameters.
119 */
120struct EmptyElement
121{
122 struct Workspace {};
123
124 template <class StratType, class OutputStage>
125 static size_t get_element_size(const WorkspaceArgs<StratType, OutputStage> &) { return 0; }
126
127 template <class WorkspaceType, class StratType, class OutputStage>
128 static void *initialise(WorkspaceType *, void *buffer, const WorkspaceArgs<StratType, OutputStage> &)
129 {
130 return buffer;
131 }
132};
133
134
135/* Store fused activations for a kernel.
136 *
137 * Activations are set based on the DepthwiseArgs.
138 */
139template <typename T, class OutputStage=Nothing>
140class ActivationsElement
141{
142 public:
143 struct Workspace
144 {
145 T activation_min, activation_max;
146 };
147
148 template <typename StratType>
149 static size_t get_element_size(const WorkspaceArgs<StratType, OutputStage> &)
150 {
151 return 0;
152 }
153
154 template <class WorkspaceType, class StratType>
155 static void *initialise(WorkspaceType *ws, void *buffer, const WorkspaceArgs<StratType, OutputStage> &args)
156 {
157 ws->activation_min = static_cast<T>(-std::numeric_limits<float>::infinity());
158 ws->activation_max = static_cast<T>(std::numeric_limits<float>::infinity());
159
160 switch (args.depthwise_args.activation.type)
161 {
162 case arm_gemm::Activation::Type::BoundedReLU:
163 ws->activation_max = static_cast<T>(args.depthwise_args.activation.param1);
164 // Fall through
165 case arm_gemm::Activation::Type::ReLU:
166 ws->activation_min = static_cast<T>(0);
167 break;
168 default:
169 break;
170 }
171
172 return buffer;
173 }
174};
175
176/* Activation clamps are contained within `arm_gemm::Requantize32`, so if the
177 * output stage is one of these we substitute in an empty workspace element.
178 */
179template <typename T>
180class ActivationsElement<T, arm_gemm::Requantize32> : public EmptyElement
181{
182};
183
184
/* Get the value with which to fill an input buffer.
 *
 * The generic case ignores the output stage and pads with `0`. The value is
 * returned as a `char` because it is consumed by `memset`.
 */
template <typename OutputStage>
char get_input_buffer_fill_value(const OutputStage &)
{
  const char zero_fill = 0;
  return zero_fill;
}
193
194/* In the case of kernels operating on quantized data, we need to fill the
195 * input buffer with the zero offset of the input tensor.
196 */
197template <> char get_input_buffer_fill_value(const arm_gemm::Requantize32 &qp) __attribute__ ((unused));
198template <> char get_input_buffer_fill_value(const arm_gemm::Requantize32 &qp)
199{
200 return qp.a_offset;
201}
202
203
204/* Container for a vector of padding values which can be safely consumed by the
205 * depthwise kernel. The padding values are initialised to either `0` or the
206 * zero offset of the input tensor (if quantized).
207 */
208template <typename T>
209class InputBufferElement
210{
211 public:
212 struct Workspace
213 {
214 T *input_buffer;
215 };
216
217 template <typename StratType, typename OutputStage>
218 static size_t get_element_size(const WorkspaceArgs<StratType, OutputStage> &args)
219 {
Michael Tyler8deee9b2023-06-30 11:26:05 +0100220 return sizeof(T) * args.depthwise_args.input_channels * args.depthwise_args.channel_multiplier;
ramelg018a164882022-04-07 02:42:52 +0100221 }
222
223 template <class WorkspaceType, typename StratType, typename OutputStage>
224 static void *initialise(WorkspaceType *ws, void *buffer, const WorkspaceArgs<StratType, OutputStage> &args)
225 {
226 ws->input_buffer = reinterpret_cast<T*>(buffer);
227 memset(ws->input_buffer, get_input_buffer_fill_value(args.output_stage), get_element_size(args));
228 return reinterpret_cast<char *>(buffer) + get_element_size(args);
229 }
230};
231
232
233/* Container for an array of output pointers, and a buffer which can be used as
234 * a destination for unnecessary writes.
235 */
236template <typename T>
237class OutputArrayElement
238{
239 public:
240 struct Workspace
241 {
242 T **outptr_array;
243 T *output_buffer;
244 };
245
246 template <typename OutputStage>
247 static size_t get_element_size(const WorkspaceArgs<IDepthfirstStrategy, OutputStage> &args)
248 {
249 return sizeof_outptr_array(args) + sizeof_output_buffer(args);
250 }
251
252 template <class WorkspaceType, typename OutputStage>
253 static void *initialise(WorkspaceType *ws, void *buffer, const WorkspaceArgs<IDepthfirstStrategy, OutputStage> &args)
254 {
255 char *buffer_bytes = reinterpret_cast<char *>(buffer);
256
257 ws->outptr_array = reinterpret_cast<T **>(buffer_bytes);
258 buffer_bytes += sizeof_outptr_array(args);
259
260 ws->output_buffer = reinterpret_cast<T *>(buffer_bytes);
261 buffer_bytes += sizeof_output_buffer(args);
262
263 return buffer_bytes;
264 }
265
266 protected:
267 template <typename OutputStage>
268 static size_t sizeof_outptr_array(const WorkspaceArgs<IDepthfirstStrategy, OutputStage> &args)
269 {
270 return sizeof(T **) * args.strategy->get_output_rows() * args.strategy->get_output_cols();
271 }
272
273 template <typename OutputStage>
274 static size_t sizeof_output_buffer(const WorkspaceArgs<IDepthfirstStrategy, OutputStage> &args)
275 {
276 return sizeof(T) * args.depthwise_args.input_channels * args.depthwise_args.channel_multiplier;
277 }
278};
279
280
Michael Tyler8deee9b2023-06-30 11:26:05 +0100281/* Intermediate array to store results of premultiplication.
282 * Used as input to the kernel instead of the original input array.
283 */
284template <typename T>
285class IntermediateBufferElement
286{
287public:
288 struct Workspace
289 {
290 T *intermediate_buffer;
291 };
292
293 template <typename StratType, typename OutputStage>
294 static size_t get_element_size(const WorkspaceArgs<StratType, OutputStage> &args)
295 {
296 auto cols = args.depthwise_args.input_cols + args.depthwise_args.kernel_cols;
297 auto rows = args.strategy->get_input_rows() + args.depthwise_args.kernel_rows;
298 auto channels = args.depthwise_args.input_channels * args.depthwise_args.channel_multiplier;
299 return sizeof(T) * cols * rows * channels;
300 }
301
302 template <class WorkspaceType, typename StratType, typename OutputStage>
303 static void *initialise(WorkspaceType *ws, void *buffer, const WorkspaceArgs<StratType, OutputStage> &args)
304 {
305 ws->intermediate_buffer = reinterpret_cast<T*>(buffer);
306 return reinterpret_cast<char *>(buffer) + get_element_size(args);
307 }
308};
309
310
ramelg018a164882022-04-07 02:42:52 +0100311/* Container for requantization parameters.
312 *
313 * This removes the distinction between per-layer and per-channel
314 * requantization parameters by providing a vector of requantization parameters
315 * regardless of whether per-layer or per-channel is selected.
316 */
317class RequantizationParametersElement
318{
319 public:
320 struct Workspace
321 {
322 const int32_t *bias, *requant_muls, *requant_shifts;
323 };
324
325 template <typename StratType>
326 static size_t get_element_size(const WorkspaceArgs<StratType, arm_gemm::Requantize32> &args)
327 {
328 return sizeof_bias(args) + sizeof_requant_muls(args) + sizeof_requant_shifts(args);
329 }
330
331 template <typename WorkspaceType, typename StratType>
332 static void *initialise(WorkspaceType *ws, void *buffer, const WorkspaceArgs<StratType, arm_gemm::Requantize32> &args)
333 {
334 const auto n_output_channels = args.depthwise_args.input_channels * args.depthwise_args.channel_multiplier;
335 char *buffer_bytes = reinterpret_cast<char *>(buffer);
336
337 ws->bias = args.output_stage.bias;
338 ws->requant_muls = args.output_stage.per_channel_muls;
339 ws->requant_shifts = args.output_stage.per_channel_right_shifts;
340
341 if (ws->bias == nullptr)
342 {
343 ws->bias = reinterpret_cast<const int32_t *>(buffer_bytes);
344 memset(buffer_bytes, 0, sizeof_bias(args));
345 buffer_bytes += sizeof_bias(args);
346 }
347
348 if (ws->requant_muls == nullptr)
349 {
350 ws->requant_muls = reinterpret_cast<const int32_t *>(buffer_bytes);
351 auto muls = reinterpret_cast<int32_t *>(buffer_bytes);
352 buffer_bytes += sizeof_requant_muls(args);
353
354 for (auto n = 0u; n < n_output_channels; n++)
355 {
356 muls[n] = args.output_stage.per_layer_mul;
357 }
358 }
359
360 if (ws->requant_shifts == nullptr)
361 {
362 ws->requant_shifts = reinterpret_cast<int32_t *>(buffer_bytes);
363 auto shifts = reinterpret_cast<int32_t *>(buffer_bytes);
364 buffer_bytes += sizeof_requant_shifts(args);
365
366 for (auto n = 0u; n < n_output_channels; n++)
367 {
368 shifts[n] = args.output_stage.per_layer_right_shift;
369 }
370 }
371
372 return buffer_bytes;
373 }
374
375 protected:
376 template <typename StratType>
377 static size_t sizeof_bias(const WorkspaceArgs<StratType, arm_gemm::Requantize32> &args)
378 {
379 return args.output_stage.bias != nullptr ?
380 0 : sizeof(int32_t) * args.depthwise_args.channel_multiplier * args.depthwise_args.input_channels;
381 }
382
383 template <typename StratType>
384 static size_t sizeof_requant_muls(const WorkspaceArgs<StratType, arm_gemm::Requantize32> &args)
385 {
386 return args.output_stage.per_channel_muls != nullptr ?
387 0 : sizeof(int32_t) * args.depthwise_args.channel_multiplier * args.depthwise_args.input_channels;
388 }
389
390 template <typename StratType>
391 static size_t sizeof_requant_shifts(const WorkspaceArgs<StratType, arm_gemm::Requantize32> &args)
392 {
393 return args.output_stage.per_channel_right_shifts != nullptr ?
394 0 : sizeof(int32_t) * args.depthwise_args.channel_multiplier * args.depthwise_args.input_channels;
395 }
396};
397
398
399template <typename ...Elements>
400class Workspace;
401
402template <typename Element, typename ...Elements>
403class Workspace<Element, Elements...>
404{
405 public:
406 struct WorkspaceType : Element::Workspace, Workspace<Elements...>::WorkspaceType
407 {
408 };
409
410 template <class S, class T>
411 static void initialise(void *buffer, const WorkspaceArgs<S, T> &args)
412 {
413 // Allocate sufficient space for the struct, then initialise each of the
414 // elements in turn.
415 auto ws = reinterpret_cast<WorkspaceType *>(buffer);
416 initialise_elements(ws, ws + 1, args);
417 }
418
419 template <class S, class T=Nothing>
420 static size_t get_sizeof_workspace(const WorkspaceArgs<S, T> &args)
421 {
422 return sizeof(WorkspaceType) + get_element_sizes(args);
423 }
424
425 template <class S, class T>
426 static inline size_t get_element_sizes(const WorkspaceArgs<S, T> &args)
427 {
428 return Element::get_element_size(args) + Workspace<Elements...>::get_element_sizes(args);
429 }
430
431 template <class WorkspaceType, class S, class T>
432 static void initialise_elements(WorkspaceType *ws, void *buffer, const WorkspaceArgs<S, T> &args)
433 {
434 buffer = Element::initialise(ws, buffer, args); // Get the next buffer
435 Workspace<Elements...>::initialise_elements(ws, buffer, args);
436 }
437};
438
439template <>
440class Workspace<>
441{
442 public:
443 struct WorkspaceType
444 {
445 };
446
447 template <class S, class T>
448 static inline size_t get_element_sizes(const WorkspaceArgs<S, T> &)
449 {
450 return 0;
451 }
452
453 template <class WorkspaceType, class S, class T>
454 static void initialise_elements(WorkspaceType *, void *, const WorkspaceArgs<S, T> &)
455 {
456 }
457};
458
459} // namespace {anonymous}
460} // namespace depthwise
461} // namespace arm_conv