/*
2 * Copyright (c) 2022 Arm Limited.
3 *
4 * SPDX-License-Identifier: MIT
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to
8 * deal in the Software without restriction, including without limitation the
9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10 * sell copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in all
14 * copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24
25#pragma once
26
#include "src/cpu/kernels/assembly/arm_gemm.hpp"
#include <cstddef>
#include <memory>
#include <string>
29
30namespace arm_conv
31{
/* Two-dimensional extent stored as a (rows, cols) pair.
 *
 * Both members default to zero so that a default-initialised Shape2D never
 * carries indeterminate values. Brace initialisation such as
 * `Shape2D{ r, c }` keeps working exactly as before.
 */
struct Shape2D
{
    unsigned int rows = 0;
    unsigned int cols = 0;
};
36
37struct ConvolutionArgs
38{
39 unsigned int n_batches;
40 Shape2D input_shape;
41 unsigned int n_input_channels;
42 unsigned int pad_top, pad_left;
43 Shape2D output_shape;
44 unsigned int n_output_channels;
45 Shape2D kernel_shape;
46 arm_gemm::Activation activation;
47
48 ConvolutionArgs(
49 unsigned int n_batches,
50 const Shape2D &input_shape,
51 unsigned int n_input_channels,
52 unsigned int pad_top, unsigned int pad_left,
53 const Shape2D &output_shape,
54 unsigned int n_output_channels,
55 const Shape2D kernel_shape,
56 const arm_gemm::Activation &activation = {})
57 : n_batches(n_batches), input_shape(input_shape), n_input_channels(n_input_channels), pad_top(pad_top), pad_left(pad_left), output_shape(output_shape), n_output_channels(n_output_channels),
58 kernel_shape(kernel_shape), activation(activation)
59 {
60 }
61};
62
63namespace winograd
64{
/* Constraints applied when selecting a Winograd implementation.
 *
 * A default-constructed config has output_rows == output_cols == 0 and three
 * empty filter strings.
 * NOTE(review): presumably zero / empty means "no constraint" - confirm
 * against the code that consumes this struct.
 */
struct WinogradConfig
{
    unsigned int output_rows{ 0 };
    unsigned int output_cols{ 0 };
    std::string input_transform_filter{};
    std::string output_transform_filter{};
    std::string weight_transform_filter{};
};
74
/* Struct describing (suggested) memory layout within the Winograd domain.
 *
 * The three `*_matrix_size_bytes` members hold sizes in bytes; the `*_ld_*`
 * members are the values forwarded as the ld_* arguments of the transform
 * execute() methods declared in this header.
 *
 * Every member defaults to zero so that a default-initialised spec (for
 * example the one embedded in a freshly created WinogradImpl) never exposes
 * indeterminate values. Aggregate initialisation is unaffected.
 */
struct WinogradDomainSpec
{
    size_t weight_matrix_size_bytes = 0;
    size_t input_matrix_size_bytes  = 0;
    size_t output_matrix_size_bytes = 0;

    size_t weight_ld_matrix = 0;
    size_t weight_ld_row    = 0;

    size_t input_ld_batch  = 0;
    size_t input_ld_matrix = 0;
    size_t input_ld_row    = 0;

    size_t output_ld_batch  = 0;
    size_t output_ld_matrix = 0;
    size_t output_ld_row    = 0;
};
85
/* Base interface shared by the weight, input and output transform
 * interfaces declared in this header.
 */
class ITransformCommon
{
public:
    /* Virtual so that implementations can be destroyed through this type. */
    virtual ~ITransformCommon() = default;

    /* Returns a reference to the name of the transform. */
    virtual const std::string &get_name() const = 0;
};
94
95namespace weight_transform
96{
97class ITransform : public ITransformCommon
98{
99public:
100 ~ITransform() = default;
101
102 virtual unsigned int get_kernel_rows(void) const = 0;
103 virtual unsigned int get_kernel_cols(void) const = 0;
104
105 virtual unsigned int get_transformed_tile_rows(void) const = 0;
106 virtual unsigned int get_transformed_tile_cols(void) const = 0;
107
108 void execute(
109 const ConvolutionArgs &args,
110 const void *inptr, size_t ld_in_row, size_t ld_in_col, size_t ld_input_channel,
111 void *outptr, const WinogradDomainSpec &wds,
112 unsigned int thread_id, unsigned int n_threads) const
113 {
114 this->execute(
115 args, inptr, ld_in_row, ld_in_col, ld_input_channel,
116 outptr, wds.weight_ld_matrix, wds.weight_ld_row,
117 thread_id, n_threads);
118 }
119
120 virtual void execute(
121 const ConvolutionArgs &args,
122 const void *inptr, size_t ld_in_row, size_t ld_in_col, size_t ld_input_channel,
123 void *outptr, size_t ld_out_matrix, size_t ld_out_row,
124 unsigned int thread_id, unsigned int n_threads) const = 0;
125};
126
127} // namespace weight_transform
128
129namespace input_transform
130{
131class ITransform : public ITransformCommon
132{
133public:
134 ~ITransform() = default;
135
136 virtual unsigned int get_input_rows(void) const = 0;
137 virtual unsigned int get_input_cols(void) const = 0;
138
139 virtual size_t get_working_space_size(
140 const ConvolutionArgs &args,
141 unsigned int n_threads) const = 0;
142
143 void execute(
144 const ConvolutionArgs &args,
145 const void *inptr, size_t ld_in_batch, size_t ld_in_row, size_t ld_in_col,
146 void *outptr, const WinogradDomainSpec &wds,
147 void *working_space, unsigned int thread_id, unsigned int n_threads) const
148 {
149 this->execute(
150 args, inptr, ld_in_batch, ld_in_row, ld_in_col,
151 outptr, wds.input_ld_batch, wds.input_ld_matrix, wds.input_ld_row,
152 working_space, thread_id, n_threads);
153 }
154
155 virtual void execute(
156 const ConvolutionArgs &args,
157 const void *inptr, size_t ld_in_batch, size_t ld_in_row, size_t ld_in_col,
158 void *outptr, size_t ld_out_batch, size_t ld_out_matrix, size_t ld_out_row,
159 void *working_space, unsigned int thread_id, unsigned int n_threads) const = 0;
160};
161
162} // namespace input_transform
163
164namespace output_transform
165{
166class ITransform : public ITransformCommon
167{
168public:
169 ~ITransform() = default;
170
171 virtual unsigned int get_input_rows(void) const = 0;
172 virtual unsigned int get_input_cols(void) const = 0;
173
174 virtual unsigned int get_output_rows(void) const = 0;
175 virtual unsigned int get_output_cols(void) const = 0;
176
177 virtual unsigned int get_kernel_rows(void) const = 0;
178 virtual unsigned int get_kernel_cols(void) const = 0;
179
180 virtual size_t get_working_space_size(
181 const ConvolutionArgs &args,
182 unsigned int n_threads) const = 0;
183
184 void execute(
185 const ConvolutionArgs &args,
186 const void *inptr, const WinogradDomainSpec &wds,
187 const void *bias,
188 void *outptr, size_t ld_out_batch, size_t ld_out_row, size_t ld_out_col,
189 void *working_space, unsigned int thread_id, unsigned int n_threads) const
190 {
191 this->execute(
192 args,
193 inptr, wds.output_ld_batch, wds.output_ld_matrix, wds.output_ld_row,
194 bias,
195 outptr, ld_out_batch, ld_out_row, ld_out_col,
196 working_space, thread_id, n_threads);
197 }
198
199 virtual void execute(
200 const ConvolutionArgs &args,
201 const void *inptr, size_t ld_in_batch, size_t ld_in_matrix, size_t ld_in_row,
202 const void *bias,
203 void *outptr, size_t ld_out_batch, size_t ld_out_row, size_t ld_out_col,
204 void *working_space, unsigned int thread_id, unsigned int n_threads) const = 0;
205};
206
207} // namespace output_transform
208
209struct WinogradImpl
210{
211 const output_transform::ITransform *output_transform = nullptr;
212 const weight_transform::ITransform *weight_transform = nullptr;
213 const input_transform::ITransform *input_transform = nullptr;
214 std::unique_ptr<arm_gemm::GemmArgs> gemm_args;
215 WinogradDomainSpec winograd_spec;
216};
217
218/* Get pointers to Winograd transforms for the given convolution problem.
219 *
220 * Assigns to the pointers in the `dest` struct and returns true or false to
221 * indicate whether the given problem can be executed or not.
222 */
223template <typename TIn, typename TWeight = TIn, typename TOut = TIn, typename TWinogradIn = TIn, typename TWinogradOut = TOut>
224bool get_implementation(
225 WinogradImpl &dest, // Destination for the selected implementation
226 const CPUInfo *,
227 const ConvolutionArgs &,
228 int max_threads,
229 bool fast_mode,
230 const WinogradConfig *,
231 const arm_gemm::GemmConfig *);
232
233} // namespace winograd
234} // namespace arm_conv