blob: 2b2e6f355505a02e9a72ff92141458fe86b4a0f2 [file] [log] [blame]
ramelg018a164882022-04-07 02:42:52 +01001/*
Pablo Marquez Tello4e2bbbb2023-01-09 17:21:01 +00002 * Copyright (c) 2022-2023 Arm Limited.
ramelg018a164882022-04-07 02:42:52 +01003 *
4 * SPDX-License-Identifier: MIT
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to
8 * deal in the Software without restriction, including without limitation the
9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10 * sell copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in all
14 * copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24
25#include "depthfirst_driver.hpp"
26#include "interleaves/generic.hpp"
27
28namespace arm_conv {
29namespace depthwise {
30
31template <typename OutputStage>
32class IPlanarStrategy
33{
34 public:
35 virtual ~IPlanarStrategy() = default;
36 virtual unsigned int get_output_rows(void) const = 0;
37 virtual arm_gemm::VLType get_vl_type(void) const = 0;
38
39 virtual size_t get_storage_size(const DepthwiseArgs &) const = 0;
40 virtual void pack_parameters(
41 const DepthwiseArgs &args, void *buffer,
42 const void *biases, const OutputStage &,
43 const void *weights, size_t ld_weight_col, size_t ld_weight_row
44 ) const = 0;
45};
46
47
/** Maps a (data types, output stage) combination onto the kernel signature
 * used by planar depthwise kernels. Only the specialisations are defined.
 */
template <
  typename TInput,
  typename TWeight,
  typename TOutput,
  typename TAccum,
  typename OutputStage
>
struct PlanarKernelType;
51
52template <typename TInput, typename TWeight, typename TOutput, typename TAccum>
53struct PlanarKernelType<TInput, TWeight, TOutput, TAccum, Nothing>
54{
55 using Type = std::function<void(
56 const TInput *, size_t ld_in_row, size_t ld_in_col, size_t ld_in_vl,
57 unsigned int pad_top, unsigned int valid_input_rows,
58 unsigned int pad_left, unsigned int valid_input_cols,
59 const TWeight *, const TAccum *,
60 TOutput **, const size_t *, const size_t *, unsigned int output_cols,
61 unsigned int start_channels, unsigned int valid_channels,
62 TAccum act_min, TAccum act_max
63 )>;
64
65 template <typename WorkspaceType>
66 static inline void execute(
67 const Type fn,
68 const TInput *inptr, size_t ld_in_row, size_t ld_in_col, size_t ld_in_vl,
69 unsigned int pad_top, unsigned int valid_input_rows,
70 unsigned int pad_left, unsigned int valid_input_cols,
71 const TWeight *weights, const TAccum *bias,
72 TOutput **outptrs, const size_t *outlds, const size_t *outvllds, unsigned int output_cols,
73 unsigned int start_channel, unsigned int valid_channels,
74 const Nothing &, const WorkspaceType *ws
75 )
76 {
77 fn(
78 inptr, ld_in_row, ld_in_col, ld_in_vl,
79 pad_top, valid_input_rows,
80 pad_left, valid_input_cols,
81 weights, bias,
82 outptrs, outlds, outvllds, output_cols,
83 start_channel, valid_channels,
84 ws->activation_min, ws->activation_max
85 );
86 }
87};
88
89template <typename TInput, typename TWeight, typename TOutput>
90struct PlanarKernelType<TInput, TWeight, TOutput, int32_t, arm_gemm::Requantize32>
91{
92 using Type = std::function<void(
93 const TInput *, size_t ld_in_row, size_t ld_in_col, size_t ld_in_vl,
94 unsigned int pad_top, unsigned int valid_input_rows,
95 unsigned int pad_left, unsigned int valid_input_cols,
96 const TWeight *,
97 TOutput **, const size_t *, const size_t *, unsigned int output_cols,
98 unsigned int start_channel, unsigned int valid_channels,
99 const arm_gemm::Requantize32 &
100 )>;
101
102 template <typename WorkspaceType>
103 static inline void execute(
104 const Type fn,
105 const TInput *inptr, size_t ld_in_row, size_t ld_in_col, size_t ld_in_vl,
106 unsigned int pad_top, unsigned int valid_input_rows,
107 unsigned int pad_left, unsigned int valid_input_cols,
108 const TWeight *weights, const int32_t *,
109 TOutput **outptrs, const size_t *outlds, const size_t *outldvls, unsigned int output_cols,
110 unsigned int first_channel, unsigned int valid_channels,
111 const arm_gemm::Requantize32 &qp, const WorkspaceType *
112 )
113 {
114 fn(
115 inptr, ld_in_row, ld_in_col, ld_in_vl,
116 pad_top, valid_input_rows,
117 pad_left, valid_input_cols,
118 weights,
119 outptrs, outlds, outldvls, output_cols,
120 first_channel, valid_channels,
121 qp
122 );
123 }
124};
125
126
127template <typename TInput, typename TWeight=TInput, typename TOutput=TInput,
128 typename TAccum=typename DefaultTAccum<TOutput>::Type,
129 typename OutputStage=typename DefaultOutputStage<TOutput>::Type>
130class PlanarStrategy : public IPlanarStrategy<OutputStage>
131{
132 unsigned int m_kernel_rows, m_kernel_cols;
133 unsigned int m_stride_rows, m_stride_cols;
134 unsigned int m_output_rows;
135 arm_gemm::VLType m_vl_type;
136
137 protected:
138 virtual bool get_kernel_packing_point(const unsigned int index, unsigned int &x, unsigned int &y) const
139 {
140 // Get the kernel point to pack at the given index; return false to
141 // indicate that this index (and all greater indices) is out of range.
142 if (m_kernel_rows * m_kernel_cols <= index)
143 return false;
144
145 y = index % m_kernel_cols;
146 x = index / m_kernel_cols;
147 return true;
148 }
149
150 virtual interleaves::PackingArguments get_kernel_packing_arguments(void) const
151 {
152 return interleaves::PackingArguments(
153 m_kernel_rows, m_kernel_cols, sizeof(TWeight),
154 false, sizeof(TAccum), // Don't pack the bias
155 m_vl_type, sizeof(TAccum), 1, // Accumulator depth of 1 TODO
156 [this] (unsigned int idx, unsigned int &x, unsigned int &y) -> bool
157 { return this->get_kernel_packing_point(idx, x, y); }
158 );
159 }
160
161 public:
162 PlanarStrategy(
163 unsigned int kernel_rows, unsigned int kernel_cols,
164 unsigned int stride_rows, unsigned int stride_cols,
165 unsigned int output_rows,
166 arm_gemm::VLType vl_type
167 ) : m_kernel_rows(kernel_rows), m_kernel_cols(kernel_cols),
168 m_stride_rows(stride_rows), m_stride_cols(stride_cols),
169 m_output_rows(output_rows), m_vl_type(vl_type)
170 {
171 }
172
173 unsigned int get_output_rows(void) const override { return m_output_rows; }
174 arm_gemm::VLType get_vl_type(void) const override { return m_vl_type; }
175
176 size_t get_storage_size(const DepthwiseArgs &args) const override
177 {
178 return interleaves::get_storage_size_generic(this->get_kernel_packing_arguments(), args);
179 }
180
181 void pack_parameters(
182 const DepthwiseArgs &args, void *buffer,
183 const void *biases, const OutputStage &,
184 const void *weights, size_t ld_weight_col, size_t ld_weight_row
185 ) const override
186 {
187 interleaves::pack_parameters_generic(
188 this->get_kernel_packing_arguments(), args,
189 buffer, biases, weights, ld_weight_col, ld_weight_row
190 );
191 }
192
193 using KernelType = typename PlanarKernelType<TInput, TWeight, TOutput, TAccum, OutputStage>::Type;
194 virtual KernelType get_kernel(void) const = 0;
195};
196
197
198namespace {
199
200template <typename T>
201struct OutputRowPtrsElement
202{
203 struct Workspace
204 {
205 T **output_row_ptrs;
206 size_t *output_ld_cols;
207 size_t *output_ld_vls; // Stride between vectors of channels
208 T *output_padding_buffer;
209 };
210
211 template <typename OutputStage>
212 static size_t get_element_size(const WorkspaceArgs<IPlanarStrategy<OutputStage>, OutputStage> &args)
213 {
214 // We need one pointer and stride for each row of output, and an additional
215 // blob of memory into which padded stores can go.
216 return args.strategy->get_output_rows() * (sizeof(T *) + 2*sizeof(size_t)) +
217 get_vector_length<char>(args.strategy->get_vl_type());
218 }
219
220 template <typename WorkspaceType, typename OutputStage>
221 static void *initialise(WorkspaceType *ws, void *buffer,
222 const WorkspaceArgs<IPlanarStrategy<OutputStage>, OutputStage> &args)
223 {
224 const auto n_rows = args.strategy->get_output_rows();
225 ws->output_row_ptrs = reinterpret_cast<T **>(buffer);
226 ws->output_ld_cols = reinterpret_cast<size_t *>(ws->output_row_ptrs + n_rows);
227 ws->output_ld_vls = ws->output_ld_cols + n_rows;
228 ws->output_padding_buffer = reinterpret_cast<T *>(ws->output_ld_vls + n_rows);
229 return ws->output_padding_buffer + get_vector_length<T>(args.strategy->get_vl_type());
230 }
231};
232
233} // namespace {anonymous}
234
235
236template <typename TInput, typename TWeight=TInput, typename TOutput=TInput,
237 typename TAccum=typename DefaultTAccum<TOutput>::Type,
238 typename OutputStage=typename DefaultOutputStage<TOutput>::Type>
239class DepthwisePlanar : public DepthwiseCommon<TInput, TWeight, TOutput>
240{
241 using Parent = DepthwiseCommon<TInput, TWeight, TOutput>;
242 using StrategyType = IPlanarStrategy<OutputStage>;
243 using WorkspaceManager = Workspace<
244 OutputRowPtrsElement<TOutput>,
245 ActivationsElement<TAccum, OutputStage>
246 >;
247 using WorkspaceType = typename WorkspaceManager::WorkspaceType;
248
249 std::unique_ptr<StrategyType> m_strat;
250 const TAccum *m_bias;
251 OutputStage m_os;
252
253 public:
254 DepthwisePlanar(StrategyType *const strat, const DepthwiseArgs &args, const OutputStage &os = {})
255 : Parent(args), m_strat(strat), m_bias(nullptr), m_os(os)
256 {
257 }
258
Viet-Hoa Do03b29712022-06-01 11:47:14 +0100259 DepthwisePlanar(DepthwisePlanar &) = delete;
260 DepthwisePlanar &operator=(DepthwisePlanar &) = delete;
261
ramelg018a164882022-04-07 02:42:52 +0100262 size_t get_storage_size(void) const override
263 {
264 return m_strat->get_storage_size(this->m_args);
265 }
266
267 void pack_parameters(
268 void *buffer, const void *biases,
269 const void *weights, size_t ld_weight_col, size_t ld_weight_row
270 ) override
271 {
272 m_strat->pack_parameters(this->m_args, buffer, biases, {}, weights, ld_weight_col, ld_weight_row);
273 this->m_bias = reinterpret_cast<const TAccum *>(biases);
274 depthwise_depthfirst::stash_bias(this->m_os, biases);
275 }
276
277 size_t get_working_size(unsigned int n_threads, unsigned int) const override
278 {
279 return this->get_working_size_per_thread() * n_threads;
280 }
281
282 protected:
283 /* Compute the amount of working space required for a single thread. */
284 virtual size_t get_working_size_per_thread(void) const
285 {
286 return WorkspaceManager::get_sizeof_workspace(
287 WorkspaceArgs<IPlanarStrategy<OutputStage>, OutputStage>(m_strat.get(), this->m_args, m_os));
288 }
289
290 /* Initialise the working space for a thread. */
291 virtual void initialise_working_space(void *buffer) const
292 {
293 WorkspaceManager::initialise(
294 buffer,
295 WorkspaceArgs<IPlanarStrategy<OutputStage>, OutputStage>(m_strat.get(), this->m_args, m_os)
296 );
297 }
298
299 /* Execute the kernel for a given chunk of work. */
300 virtual void execute_kernel(
301 const TInput *inptr, size_t ld_in_row, size_t ld_in_col, size_t ld_in_vl,
302 unsigned int pad_top, unsigned int valid_input_rows,
303 unsigned int pad_left, unsigned int valid_input_cols,
304 const TWeight *weights, const TAccum *bias,
305 TOutput *outptr, size_t ld_out_row, size_t ld_out_col, size_t ld_out_vl,
306 unsigned int valid_output_rows, unsigned int valid_output_cols,
307 unsigned int first_channel, unsigned int valid_channels,
308 WorkspaceType *ws
309 ) const
310 {
311 // Initialise the output pointers
312 for (auto i = 0u; i < m_strat->get_output_rows(); i++)
313 {
314 // Point at the output tensor for all valid rows; otherwise point at the
315 // padding buffer.
316 ws->output_row_ptrs[i] = i < valid_output_rows ? outptr : ws->output_padding_buffer;
317 ws->output_ld_cols[i] = i < valid_output_rows ? ld_out_col : 0;
318 ws->output_ld_vls[i] = i < valid_output_rows ? ld_out_vl : 0;
319 outptr += ld_out_row;
320 }
321
322 // Execute the kernel
323 PlanarKernelType<TInput, TWeight, TOutput, TAccum, OutputStage>::template execute<WorkspaceType>(
324 reinterpret_cast<const PlanarStrategy<TInput, TWeight, TOutput, TAccum, OutputStage> *>(m_strat.get())->get_kernel(),
325 inptr, ld_in_row, ld_in_col, ld_in_vl,
326 pad_top, valid_input_rows, pad_left, valid_input_cols,
327 weights, bias,
328 ws->output_row_ptrs, ws->output_ld_cols, ws->output_ld_vls,
329 valid_output_cols, first_channel, valid_channels,
330 this->m_os, ws
331 );
332 }
333
334 void execute_internal(
Pablo Marquez Tello4e2bbbb2023-01-09 17:21:01 +0000335 const DepthwiseArgs &args,
ramelg018a164882022-04-07 02:42:52 +0100336 const void *input,
337 size_t ld_input_col,
338 size_t ld_input_row,
339 size_t ld_input_batch,
340 const void *parameters,
ramelg018a164882022-04-07 02:42:52 +0100341 void *output,
342 size_t ld_output_col,
343 size_t ld_output_row,
344 size_t ld_output_batch,
345 void *working_space,
346 unsigned int thread_id,
347 unsigned int n_threads
348 ) const override
349 {
350 // Get and initialise the working space for this thread.
351 void *thread_working_space =
352 static_cast<uint8_t *>(working_space) + thread_id * this->get_working_size_per_thread();
353 this->initialise_working_space(thread_working_space);
354 auto ws = reinterpret_cast<WorkspaceType *>(thread_working_space);
355
Pablo Marquez Tello4e2bbbb2023-01-09 17:21:01 +0000356 const auto n_output_channels = args.input_channels * args.channel_multiplier;
ramelg018a164882022-04-07 02:42:52 +0100357 const auto vl = get_vector_length<TAccum>(m_strat->get_vl_type());
358
359 // Get typed pointers
360 auto input_batch = reinterpret_cast<const TInput *>(input);
361 auto output_batch = reinterpret_cast<TOutput *>(output);
362 auto weights = reinterpret_cast<const TWeight *>(parameters);
363
364 // Iterate over batches
Pablo Marquez Tello4e2bbbb2023-01-09 17:21:01 +0000365 for (auto batches = args.n_batches; batches; batches--)
ramelg018a164882022-04-07 02:42:52 +0100366 {
367 // NOTE: Other loop orderings are possible and it would be worth
368 // investigating them.
369
370 // Within a batch, stripe threads across rows.
371 for (auto start_output_i = thread_id * m_strat->get_output_rows();
Pablo Marquez Tello4e2bbbb2023-01-09 17:21:01 +0000372 start_output_i < args.output_rows;
ramelg018a164882022-04-07 02:42:52 +0100373 start_output_i += n_threads * m_strat->get_output_rows())
374 {
375 // Determine what (if any padding) is required on the top/bottom of
376 // this row of the convolution.
Pablo Marquez Tello4e2bbbb2023-01-09 17:21:01 +0000377 const int start_input_i = start_output_i * args.stride_rows - args.padding.top;
ramelg018a164882022-04-07 02:42:52 +0100378 const unsigned int input_pad_top = start_input_i < 0 ? -start_input_i : 0;
379 const unsigned int input_i = start_input_i < 0 ? 0 : start_input_i;
Pablo Marquez Tello4e2bbbb2023-01-09 17:21:01 +0000380 const unsigned int valid_input_rows = input_i > args.input_rows ? 0 : args.input_rows - input_i;
381 const unsigned int valid_output_rows = args.output_rows - start_output_i;
ramelg018a164882022-04-07 02:42:52 +0100382
383 auto inptr_row = input_batch + input_i*ld_input_row;
384 auto outptr_row = output_batch + start_output_i * ld_output_row;
385
386 // Execute the kernel
387 this->execute_kernel(
388 inptr_row, ld_input_row, ld_input_col, vl,
Pablo Marquez Tello4e2bbbb2023-01-09 17:21:01 +0000389 input_pad_top, valid_input_rows, args.padding.left, args.input_cols,
ramelg018a164882022-04-07 02:42:52 +0100390 weights, this->m_bias,
391 outptr_row, ld_output_row, ld_output_col, vl,
Pablo Marquez Tello4e2bbbb2023-01-09 17:21:01 +0000392 valid_output_rows, args.output_cols,
ramelg018a164882022-04-07 02:42:52 +0100393 0 /* first channel */, n_output_channels,
394 ws
395 );
396 }
397
398 // Update the input and output pointers to account for batch
399 input_batch += ld_input_batch;
400 output_batch += ld_output_batch;
401 }
402 }
403};
404
405} // namespace depthwise
406} // namespace arm_conv