| /* |
| * Copyright (c) 2022-2023 Arm Limited. |
| * |
| * SPDX-License-Identifier: MIT |
| * |
| * Permission is hereby granted, free of charge, to any person obtaining a copy |
| * of this software and associated documentation files (the "Software"), to |
| * deal in the Software without restriction, including without limitation the |
| * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or |
| * sell copies of the Software, and to permit persons to whom the Software is |
| * furnished to do so, subject to the following conditions: |
| * |
| * The above copyright notice and this permission notice shall be included in all |
| * copies or substantial portions of the Software. |
| * |
| * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
| * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
| * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE |
| * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
| * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, |
| * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE |
| * SOFTWARE. |
| */ |
| |
| /* Depthwise kernel drivers commonly require a per-thread blob of working space |
| * in which to store parameters required by the depthwise implementations. The |
| * composition of this working space varies with the driver, kernel, and data |
| * types -- but the tasks of requesting sufficient space, allocating buffer |
| * space, and performing initialisation of the working space are common. |
| * |
| * The classes in this file consist of a number of working space "Elements" |
| * (which are logical units of functionality) and a Workspace type which allows |
| * for compile time composition of elements into a single working space type. |
| * |
| * Creating a workspace |
| * ==================== |
| * |
| * A new workspace type can be created by combining Elements as an argument to |
| * the Workspace class. For instance: |
| * |
| * Workspace< |
| * depthwise_depthfirst::InputArrayElement<float>, |
| * InputBufferElement<float>, |
| * OutputArrayElement<float> |
| * > |
| * |
| * Creates a new Workspace consisting of the given elements. The workspace type |
| * contained within this class (`Workspace<...>::WorkspaceType`) is equivalent to: |
| * |
| * struct WorkspaceType |
| * { |
| * const float **inptr_array; // From InputArrayElement<float> |
| * float *input_buffer; // From InputBufferElement<float> |
| * float **outptr_array; // From OutputArrayElement<float> |
| * float *output_buffer; // From OutputArrayElement<float> |
| * }; |
| * |
| * Calling `Workspace<...>::get_sizeof_workspace(...)` will return the amount |
| * of space required to store the above struct and the elements contained |
| * within it. Once this space has been allocated, the workspace can be |
| * initialised by calling `Workspace<...>::initialise` with a pointer to the |
| * buffer and the same arguments. This will place a struct of type |
| * `Workspace<...>::WorkspaceType` at the start of the buffer, and share the |
| * remaining space between the specified elements. As this is all done at |
| * compile time, later code can access elements from the `WorkspaceType` by |
| * name. |
| * |
| * Writing a new element |
| * ===================== |
| * |
| * Each Element must provide: |
| * - A struct called "Workspace" containing the variables contained within |
| * this portion of the workspace. |
| * - A static method called `get_element_size` which returns the amount of |
| * buffer space required by this element of the workspace (NOT including the |
| * size of the Workspace struct). For example, an element which stores a |
| * vector of pointers will return the amount of space required to store the |
| * vector. |
| * - A static method called `initialise` which accepts a pointer to a struct |
| * which will be composed of the Element's `Workspace` struct (along with |
| * other elements), a pointer to the start of the buffer allocated for this |
| * portion of the workspace, and arguments to be used to initialise the |
| * workspace. The Element should consume as much of the buffer as it |
| * requires, initialise the Workspace, and then return the pointer to the |
| * next free byte of the buffer. |
| * |
| * See the below elements for an example of how this should work. |
| */ |
| |
| #pragma once |
| |
| #include "depthwise.hpp" |
| #include "depthfirst_driver.hpp" |
| #include "utils.hpp" |
| |
| namespace arm_conv { |
| namespace depthwise { |
| namespace { // anonymous because we expect this to appear in several compilation units |
| |
| /* Arguments to use to size and initialise a workspace. |
| */ |
| template <class StratType, class OutputStage=Nothing> |
| struct WorkspaceArgs |
| { |
| const StratType *strategy; |
| const DepthwiseArgs &depthwise_args; |
| const OutputStage &output_stage; |
| |
| WorkspaceArgs(const StratType *strat, const DepthwiseArgs &dwargs, const OutputStage &os = {}) |
| : strategy(strat), depthwise_args(dwargs), output_stage(os) |
| { |
| } |
| }; |
| |
| |
| /* Sometimes we use templated structs to fill in workspace types, the Empty |
| * element can be useful for when a blank element is required for some sets of |
| * parameters. |
| */ |
| struct EmptyElement |
| { |
| struct Workspace {}; |
| |
| template <class StratType, class OutputStage> |
| static size_t get_element_size(const WorkspaceArgs<StratType, OutputStage> &) { return 0; } |
| |
| template <class WorkspaceType, class StratType, class OutputStage> |
| static void *initialise(WorkspaceType *, void *buffer, const WorkspaceArgs<StratType, OutputStage> &) |
| { |
| return buffer; |
| } |
| }; |
| |
| |
| /* Store fused activations for a kernel. |
| * |
| * Activations are set based on the DepthwiseArgs. |
| */ |
| template <typename T, class OutputStage=Nothing> |
| class ActivationsElement |
| { |
| public: |
| struct Workspace |
| { |
| T activation_min, activation_max; |
| }; |
| |
| template <typename StratType> |
| static size_t get_element_size(const WorkspaceArgs<StratType, OutputStage> &) |
| { |
| return 0; |
| } |
| |
| template <class WorkspaceType, class StratType> |
| static void *initialise(WorkspaceType *ws, void *buffer, const WorkspaceArgs<StratType, OutputStage> &args) |
| { |
| ws->activation_min = static_cast<T>(-std::numeric_limits<float>::infinity()); |
| ws->activation_max = static_cast<T>(std::numeric_limits<float>::infinity()); |
| |
| switch (args.depthwise_args.activation.type) |
| { |
| case arm_gemm::Activation::Type::BoundedReLU: |
| ws->activation_max = static_cast<T>(args.depthwise_args.activation.param1); |
| // Fall through |
| case arm_gemm::Activation::Type::ReLU: |
| ws->activation_min = static_cast<T>(0); |
| break; |
| default: |
| break; |
| } |
| |
| return buffer; |
| } |
| }; |
| |
| /* Activation clamps are contained within `arm_gemm::Requantize32`, so if the |
| * output stage is one of these we substitute in an empty workspace element. |
| */ |
| template <typename T> |
| class ActivationsElement<T, arm_gemm::Requantize32> : public EmptyElement |
| { |
| }; |
| |
| |
/* Value with which an input buffer should be filled.
 *
 * For a generic output stage this is `0`. The result is a `char` because it
 * is handed to `memset`.
 */
template <typename Stage>
char get_input_buffer_fill_value(const Stage &)
{
  constexpr char fill_value = 0;
  return fill_value;
}
| |
| /* In the case of kernels operating on quantized data, we need to fill the |
| * input buffer with the zero offset of the input tensor. |
| */ |
| template <> char get_input_buffer_fill_value(const arm_gemm::Requantize32 &qp) __attribute__ ((unused)); |
| template <> char get_input_buffer_fill_value(const arm_gemm::Requantize32 &qp) |
| { |
| return qp.a_offset; |
| } |
| |
| |
| /* Container for a vector of padding values which can be safely consumed by the |
| * depthwise kernel. The padding values are initialised to either `0` or the |
| * zero offset of the input tensor (if quantized). |
| */ |
| template <typename T> |
| class InputBufferElement |
| { |
| public: |
| struct Workspace |
| { |
| T *input_buffer; |
| }; |
| |
| template <typename StratType, typename OutputStage> |
| static size_t get_element_size(const WorkspaceArgs<StratType, OutputStage> &args) |
| { |
| return sizeof(T) * args.depthwise_args.input_channels * args.depthwise_args.channel_multiplier; |
| } |
| |
| template <class WorkspaceType, typename StratType, typename OutputStage> |
| static void *initialise(WorkspaceType *ws, void *buffer, const WorkspaceArgs<StratType, OutputStage> &args) |
| { |
| ws->input_buffer = reinterpret_cast<T*>(buffer); |
| memset(ws->input_buffer, get_input_buffer_fill_value(args.output_stage), get_element_size(args)); |
| return reinterpret_cast<char *>(buffer) + get_element_size(args); |
| } |
| }; |
| |
| |
| /* Container for an array of output pointers, and a buffer which can be used as |
| * a destination for unnecessary writes. |
| */ |
| template <typename T> |
| class OutputArrayElement |
| { |
| public: |
| struct Workspace |
| { |
| T **outptr_array; |
| T *output_buffer; |
| }; |
| |
| template <typename OutputStage> |
| static size_t get_element_size(const WorkspaceArgs<IDepthfirstStrategy, OutputStage> &args) |
| { |
| return sizeof_outptr_array(args) + sizeof_output_buffer(args); |
| } |
| |
| template <class WorkspaceType, typename OutputStage> |
| static void *initialise(WorkspaceType *ws, void *buffer, const WorkspaceArgs<IDepthfirstStrategy, OutputStage> &args) |
| { |
| char *buffer_bytes = reinterpret_cast<char *>(buffer); |
| |
| ws->outptr_array = reinterpret_cast<T **>(buffer_bytes); |
| buffer_bytes += sizeof_outptr_array(args); |
| |
| ws->output_buffer = reinterpret_cast<T *>(buffer_bytes); |
| buffer_bytes += sizeof_output_buffer(args); |
| |
| return buffer_bytes; |
| } |
| |
| protected: |
| template <typename OutputStage> |
| static size_t sizeof_outptr_array(const WorkspaceArgs<IDepthfirstStrategy, OutputStage> &args) |
| { |
| return sizeof(T **) * args.strategy->get_output_rows() * args.strategy->get_output_cols(); |
| } |
| |
| template <typename OutputStage> |
| static size_t sizeof_output_buffer(const WorkspaceArgs<IDepthfirstStrategy, OutputStage> &args) |
| { |
| return sizeof(T) * args.depthwise_args.input_channels * args.depthwise_args.channel_multiplier; |
| } |
| }; |
| |
| |
| /* Intermediate array to store results of premultiplication. |
| * Used as input to the kernel instead of the original input array. |
| */ |
| template <typename T> |
| class IntermediateBufferElement |
| { |
| public: |
| struct Workspace |
| { |
| T *intermediate_buffer; |
| }; |
| |
| template <typename StratType, typename OutputStage> |
| static size_t get_element_size(const WorkspaceArgs<StratType, OutputStage> &args) |
| { |
| auto cols = args.depthwise_args.input_cols + args.depthwise_args.kernel_cols; |
| auto rows = args.strategy->get_input_rows() + args.depthwise_args.kernel_rows; |
| auto channels = args.depthwise_args.input_channels * args.depthwise_args.channel_multiplier; |
| return sizeof(T) * cols * rows * channels; |
| } |
| |
| template <class WorkspaceType, typename StratType, typename OutputStage> |
| static void *initialise(WorkspaceType *ws, void *buffer, const WorkspaceArgs<StratType, OutputStage> &args) |
| { |
| ws->intermediate_buffer = reinterpret_cast<T*>(buffer); |
| return reinterpret_cast<char *>(buffer) + get_element_size(args); |
| } |
| }; |
| |
| |
| /* Container for requantization parameters. |
| * |
| * This removes the distinction between per-layer and per-channel |
| * requantization parameters by providing a vector of requantization parameters |
| * regardless of whether per-layer or per-channel is selected. |
| */ |
| class RequantizationParametersElement |
| { |
| public: |
| struct Workspace |
| { |
| const int32_t *bias, *requant_muls, *requant_shifts; |
| }; |
| |
| template <typename StratType> |
| static size_t get_element_size(const WorkspaceArgs<StratType, arm_gemm::Requantize32> &args) |
| { |
| return sizeof_bias(args) + sizeof_requant_muls(args) + sizeof_requant_shifts(args); |
| } |
| |
| template <typename WorkspaceType, typename StratType> |
| static void *initialise(WorkspaceType *ws, void *buffer, const WorkspaceArgs<StratType, arm_gemm::Requantize32> &args) |
| { |
| const auto n_output_channels = args.depthwise_args.input_channels * args.depthwise_args.channel_multiplier; |
| char *buffer_bytes = reinterpret_cast<char *>(buffer); |
| |
| ws->bias = args.output_stage.bias; |
| ws->requant_muls = args.output_stage.per_channel_muls; |
| ws->requant_shifts = args.output_stage.per_channel_right_shifts; |
| |
| if (ws->bias == nullptr) |
| { |
| ws->bias = reinterpret_cast<const int32_t *>(buffer_bytes); |
| memset(buffer_bytes, 0, sizeof_bias(args)); |
| buffer_bytes += sizeof_bias(args); |
| } |
| |
| if (ws->requant_muls == nullptr) |
| { |
| ws->requant_muls = reinterpret_cast<const int32_t *>(buffer_bytes); |
| auto muls = reinterpret_cast<int32_t *>(buffer_bytes); |
| buffer_bytes += sizeof_requant_muls(args); |
| |
| for (auto n = 0u; n < n_output_channels; n++) |
| { |
| muls[n] = args.output_stage.per_layer_mul; |
| } |
| } |
| |
| if (ws->requant_shifts == nullptr) |
| { |
| ws->requant_shifts = reinterpret_cast<int32_t *>(buffer_bytes); |
| auto shifts = reinterpret_cast<int32_t *>(buffer_bytes); |
| buffer_bytes += sizeof_requant_shifts(args); |
| |
| for (auto n = 0u; n < n_output_channels; n++) |
| { |
| shifts[n] = args.output_stage.per_layer_right_shift; |
| } |
| } |
| |
| return buffer_bytes; |
| } |
| |
| protected: |
| template <typename StratType> |
| static size_t sizeof_bias(const WorkspaceArgs<StratType, arm_gemm::Requantize32> &args) |
| { |
| return args.output_stage.bias != nullptr ? |
| 0 : sizeof(int32_t) * args.depthwise_args.channel_multiplier * args.depthwise_args.input_channels; |
| } |
| |
| template <typename StratType> |
| static size_t sizeof_requant_muls(const WorkspaceArgs<StratType, arm_gemm::Requantize32> &args) |
| { |
| return args.output_stage.per_channel_muls != nullptr ? |
| 0 : sizeof(int32_t) * args.depthwise_args.channel_multiplier * args.depthwise_args.input_channels; |
| } |
| |
| template <typename StratType> |
| static size_t sizeof_requant_shifts(const WorkspaceArgs<StratType, arm_gemm::Requantize32> &args) |
| { |
| return args.output_stage.per_channel_right_shifts != nullptr ? |
| 0 : sizeof(int32_t) * args.depthwise_args.channel_multiplier * args.depthwise_args.input_channels; |
| } |
| }; |
| |
| |
| template <typename ...Elements> |
| class Workspace; |
| |
| template <typename Element, typename ...Elements> |
| class Workspace<Element, Elements...> |
| { |
| public: |
| struct WorkspaceType : Element::Workspace, Workspace<Elements...>::WorkspaceType |
| { |
| }; |
| |
| template <class S, class T> |
| static void initialise(void *buffer, const WorkspaceArgs<S, T> &args) |
| { |
| // Allocate sufficient space for the struct, then initialise each of the |
| // elements in turn. |
| auto ws = reinterpret_cast<WorkspaceType *>(buffer); |
| initialise_elements(ws, ws + 1, args); |
| } |
| |
| template <class S, class T=Nothing> |
| static size_t get_sizeof_workspace(const WorkspaceArgs<S, T> &args) |
| { |
| return sizeof(WorkspaceType) + get_element_sizes(args); |
| } |
| |
| template <class S, class T> |
| static inline size_t get_element_sizes(const WorkspaceArgs<S, T> &args) |
| { |
| return Element::get_element_size(args) + Workspace<Elements...>::get_element_sizes(args); |
| } |
| |
| template <class WorkspaceType, class S, class T> |
| static void initialise_elements(WorkspaceType *ws, void *buffer, const WorkspaceArgs<S, T> &args) |
| { |
| buffer = Element::initialise(ws, buffer, args); // Get the next buffer |
| Workspace<Elements...>::initialise_elements(ws, buffer, args); |
| } |
| }; |
| |
| template <> |
| class Workspace<> |
| { |
| public: |
| struct WorkspaceType |
| { |
| }; |
| |
| template <class S, class T> |
| static inline size_t get_element_sizes(const WorkspaceArgs<S, T> &) |
| { |
| return 0; |
| } |
| |
| template <class WorkspaceType, class S, class T> |
| static void initialise_elements(WorkspaceType *, void *, const WorkspaceArgs<S, T> &) |
| { |
| } |
| }; |
| |
| } // namespace {anonymous} |
| } // namespace depthwise |
| } // namespace arm_conv |