/*
 * Copyright (c) 2022-2023 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#pragma once

#include "depthfirst_driver.hpp"
#include "interleaves/generic.hpp"

namespace arm_conv {
namespace depthwise {

// Interface through which the driver can query a planar strategy for its
// geometry and ask it to pack parameters, without knowing the concrete data
// types involved.
template <typename OutputStage>
class IPlanarStrategy
{
  public:
  virtual ~IPlanarStrategy() = default;
  virtual unsigned int get_output_rows(void) const = 0;
  virtual arm_gemm::VLType get_vl_type(void) const = 0;

  virtual size_t get_storage_size(const DepthwiseArgs &) const = 0;
  virtual void pack_parameters(
    const DepthwiseArgs &args, void *buffer,
    const void *biases, const OutputStage &,
    const void *weights, size_t ld_weight_col, size_t ld_weight_row
  ) const = 0;
};


// Provides the kernel function-pointer type for a given output stage, and a
// uniform execute() wrapper which maps the driver's arguments onto that
// kernel's calling convention.
template <typename TInput, typename TWeight, typename TOutput, typename TAccum,
          typename OutputStage>
struct PlanarKernelType;

template <typename TInput, typename TWeight, typename TOutput, typename TAccum>
struct PlanarKernelType<TInput, TWeight, TOutput, TAccum, Nothing>
{
  typedef void (*Type)(
    const TInput *, size_t ld_in_row, size_t ld_in_col, size_t ld_in_vl,
    unsigned int pad_top, unsigned int valid_input_rows,
    unsigned int pad_left, unsigned int valid_input_cols,
    const TWeight *, const TAccum *,
    TOutput **, const size_t *, const size_t *, unsigned int output_cols,
    unsigned int start_channel, unsigned int valid_channels,
    TAccum act_min, TAccum act_max
  );

  template <typename WorkspaceType>
  static inline void execute(
    const Type fn,
    const TInput *inptr, size_t ld_in_row, size_t ld_in_col, size_t ld_in_vl,
    unsigned int pad_top, unsigned int valid_input_rows,
    unsigned int pad_left, unsigned int valid_input_cols,
    const TWeight *weights, const TAccum *bias,
    TOutput **outptrs, const size_t *outlds, const size_t *outldvls, unsigned int output_cols,
    unsigned int start_channel, unsigned int valid_channels,
    const Nothing &, const WorkspaceType *ws
  )
  {
    fn(
      inptr, ld_in_row, ld_in_col, ld_in_vl,
      pad_top, valid_input_rows,
      pad_left, valid_input_cols,
      weights, bias,
      outptrs, outlds, outldvls, output_cols,
      start_channel, valid_channels,
      ws->activation_min, ws->activation_max
    );
  }
};
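
// A hypothetical sketch (for illustration only; not a kernel shipped with the
// library) of how a concrete kernel is declared against the ABI above. The
// kernel geometry (kernel size and stride) is baked into the kernel body
// rather than passed as an argument:
//
//   void hypothetical_fp32_planar_kernel(
//     const float *inptr, size_t ld_in_row, size_t ld_in_col, size_t ld_in_vl,
//     unsigned int pad_top, unsigned int valid_input_rows,
//     unsigned int pad_left, unsigned int valid_input_cols,
//     const float *weights, const float *bias,
//     float **outptrs, const size_t *outlds, const size_t *outldvls,
//     unsigned int output_cols,
//     unsigned int start_channel, unsigned int valid_channels,
//     float act_min, float act_max
//   );
//
// A pointer to such a function satisfies
// PlanarKernelType<float, float, float, float, Nothing>::Type.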

template <typename TInput, typename TWeight, typename TOutput>
struct PlanarKernelType<TInput, TWeight, TOutput, int32_t, arm_gemm::Requantize32>
{
  typedef void (*Type)(
    const TInput *, size_t ld_in_row, size_t ld_in_col, size_t ld_in_vl,
    unsigned int pad_top, unsigned int valid_input_rows,
    unsigned int pad_left, unsigned int valid_input_cols,
    const TWeight *,
    TOutput **, const size_t *, const size_t *, unsigned int output_cols,
    unsigned int start_channel, unsigned int valid_channels,
    const arm_gemm::Requantize32 &
  );

  template <typename WorkspaceType>
  static inline void execute(
    const Type fn,
    const TInput *inptr, size_t ld_in_row, size_t ld_in_col, size_t ld_in_vl,
    unsigned int pad_top, unsigned int valid_input_rows,
    unsigned int pad_left, unsigned int valid_input_cols,
    const TWeight *weights, const int32_t *,
    TOutput **outptrs, const size_t *outlds, const size_t *outldvls, unsigned int output_cols,
    unsigned int first_channel, unsigned int valid_channels,
    const arm_gemm::Requantize32 &qp, const WorkspaceType *
  )
  {
    fn(
      inptr, ld_in_row, ld_in_col, ld_in_vl,
      pad_top, valid_input_rows,
      pad_left, valid_input_cols,
      weights,
      outptrs, outlds, outldvls, output_cols,
      first_channel, valid_channels,
      qp
    );
  }
};
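
// Note: the quantized specialisation above takes no separate bias pointer or
// activation clamp at the ABI; the bias and activation bounds are instead
// assumed to travel inside the arm_gemm::Requantize32 parameters (hence
// depthwise_depthfirst::stash_bias() in DepthwisePlanar::pack_parameters()
// below), which is why execute() ignores its bias and workspace arguments.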


template <typename TInput, typename TWeight=TInput, typename TOutput=TInput,
          typename TAccum=typename DefaultTAccum<TOutput>::Type,
          typename OutputStage=typename DefaultOutputStage<TOutput>::Type>
class PlanarStrategy : public IPlanarStrategy<OutputStage>
{
  unsigned int m_kernel_rows, m_kernel_cols;
  unsigned int m_stride_rows, m_stride_cols;
  unsigned int m_output_rows;
  arm_gemm::VLType m_vl_type;

  protected:
  virtual bool get_kernel_packing_point(const unsigned int index, unsigned int &x, unsigned int &y) const
  {
    // Get the kernel point to pack at the given index; return false to
    // indicate that this index (and all greater indices) is out of range.
    if (m_kernel_rows * m_kernel_cols <= index)
      return false;

    y = index % m_kernel_cols;
    x = index / m_kernel_cols;
    return true;
  }
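
  // Worked example for the mapping above (illustrative): with a 3x3 kernel,
  // index 5 yields y = 5 % 3 = 2 and x = 5 / 3 = 1, i.e. the point in kernel
  // row 1, column 2; index 9 (and above) returns false.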

  virtual interleaves::PackingArguments get_kernel_packing_arguments(void) const
  {
    return interleaves::PackingArguments(
      m_kernel_rows, m_kernel_cols, sizeof(TWeight),
      false, sizeof(TAccum), true,  // Don't pack the bias
      m_vl_type, sizeof(TAccum), 1,  // Accumulator depth of 1 TODO
      [this] (unsigned int idx, unsigned int &x, unsigned int &y) -> bool
      { return this->get_kernel_packing_point(idx, x, y); }
    );
  }

  public:
  PlanarStrategy(
    unsigned int kernel_rows, unsigned int kernel_cols,
    unsigned int stride_rows, unsigned int stride_cols,
    unsigned int output_rows,
    arm_gemm::VLType vl_type
  ) : m_kernel_rows(kernel_rows), m_kernel_cols(kernel_cols),
      m_stride_rows(stride_rows), m_stride_cols(stride_cols),
      m_output_rows(output_rows), m_vl_type(vl_type)
  {
  }

  unsigned int get_output_rows(void) const override { return m_output_rows; }
  arm_gemm::VLType get_vl_type(void) const override { return m_vl_type; }

  size_t get_storage_size(const DepthwiseArgs &args) const override
  {
    return interleaves::get_storage_size_generic(this->get_kernel_packing_arguments(), args);
  }

  void pack_parameters(
    const DepthwiseArgs &args, void *buffer,
    const void *biases, const OutputStage &,
    const void *weights, size_t ld_weight_col, size_t ld_weight_row
  ) const override
  {
    interleaves::pack_parameters_generic(
      this->get_kernel_packing_arguments(), args,
      buffer, biases, weights, ld_weight_col, ld_weight_row
    );
  }

  using KernelType = typename PlanarKernelType<TInput, TWeight, TOutput, TAccum, OutputStage>::Type;
  virtual KernelType get_kernel(void) const = 0;
};


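// Illustrative only: a concrete strategy (all names hypothetical) subclasses
// PlanarStrategy, fixes its geometry in the constructor and exposes its
// specialised kernel:
//
//   class hypothetical_sve_fp32_planar_3x3_s1
//     : public PlanarStrategy<float>
//   {
//     public:
//     hypothetical_sve_fp32_planar_3x3_s1()
//     : PlanarStrategy<float>(3, 3,  // 3x3 kernel
//                             1, 1,  // unit stride
//                             4,     // assumed: four output rows per call
//                             arm_gemm::VLType::SVE)
//     {
//     }
//
//     KernelType get_kernel(void) const override
//     { return hypothetical_fp32_planar_kernel; }
//   };
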
namespace {

template <typename T>
struct OutputRowPtrsElement
{
  struct Workspace
  {
    T **output_row_ptrs;
    size_t *output_ld_cols;
    size_t *output_ld_vls;  // Stride between vectors of channels
    T *output_padding_buffer;
  };

  template <typename OutputStage>
  static size_t get_element_size(const WorkspaceArgs<IPlanarStrategy<OutputStage>, OutputStage> &args)
  {
    // We need one pointer and two strides for each row of output, plus an
    // additional blob of memory into which padded stores can go.
    return args.strategy->get_output_rows() * (sizeof(T *) + 2*sizeof(size_t)) +
           get_vector_length<char>(args.strategy->get_vl_type());
  }
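
  // For example (illustrative): with four output rows, 64-bit pointers and a
  // 256-bit vector length, this is 4 * (8 + 2*8) + 32 = 128 bytes.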

  template <typename WorkspaceType, typename OutputStage>
  static void *initialise(WorkspaceType *ws, void *buffer,
                          const WorkspaceArgs<IPlanarStrategy<OutputStage>, OutputStage> &args)
  {
    const auto n_rows = args.strategy->get_output_rows();
    ws->output_row_ptrs = reinterpret_cast<T **>(buffer);
    ws->output_ld_cols = reinterpret_cast<size_t *>(ws->output_row_ptrs + n_rows);
    ws->output_ld_vls = ws->output_ld_cols + n_rows;
    ws->output_padding_buffer = reinterpret_cast<T *>(ws->output_ld_vls + n_rows);
    return ws->output_padding_buffer + get_vector_length<T>(args.strategy->get_vl_type());
  }
};
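
// The memory sized by get_element_size() is carved up by initialise() in the
// following order (illustrative):
//
//   [T *output_row_ptrs[n_rows]][size_t output_ld_cols[n_rows]]
//   [size_t output_ld_vls[n_rows]][T output_padding_buffer[vector length]]
//
// and the returned pointer is one-past the padding buffer, ready for the
// next workspace element.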

} // namespace {anonymous}


template <typename TInput, typename TWeight=TInput, typename TOutput=TInput,
          typename TAccum=typename DefaultTAccum<TOutput>::Type,
          typename OutputStage=typename DefaultOutputStage<TOutput>::Type>
class DepthwisePlanar : public DepthwiseCommon<TInput, TWeight, TOutput>
{
  using Parent = DepthwiseCommon<TInput, TWeight, TOutput>;
  using StrategyType = IPlanarStrategy<OutputStage>;
  using WorkspaceManager = Workspace<
    OutputRowPtrsElement<TOutput>,
    ActivationsElement<TAccum, OutputStage>
  >;
  using WorkspaceType = typename WorkspaceManager::WorkspaceType;

  std::unique_ptr<StrategyType> m_strat;
  const TAccum *m_bias;
  OutputStage m_os;

  public:
  DepthwisePlanar(StrategyType *const strat, const DepthwiseArgs &args, const OutputStage &os = {})
  : Parent(args), m_strat(strat), m_bias(nullptr), m_os(os)
  {
  }

  DepthwisePlanar(DepthwisePlanar &) = delete;
  DepthwisePlanar &operator=(DepthwisePlanar &) = delete;

  size_t get_storage_size(void) const override
  {
    return m_strat->get_storage_size(this->m_args);
  }

  void pack_parameters(
    void *buffer, const void *biases,
    const void *weights, size_t ld_weight_col, size_t ld_weight_row
  ) override
  {
    m_strat->pack_parameters(this->m_args, buffer, biases, {}, weights, ld_weight_col, ld_weight_row);
    this->m_bias = reinterpret_cast<const TAccum *>(biases);
    depthwise_depthfirst::stash_bias(this->m_os, biases);
  }

  size_t get_working_size(unsigned int n_threads) const override
  {
    return this->get_working_size_per_thread() * n_threads;
  }

  protected:
  /* Compute the amount of working space required for a single thread. */
  virtual size_t get_working_size_per_thread(void) const
  {
    return WorkspaceManager::get_sizeof_workspace(
      WorkspaceArgs<IPlanarStrategy<OutputStage>, OutputStage>(m_strat.get(), this->m_args, m_os));
  }

  /* Initialise the working space for a thread. */
  virtual void initialise_working_space(void *buffer) const
  {
    WorkspaceManager::initialise(
      buffer,
      WorkspaceArgs<IPlanarStrategy<OutputStage>, OutputStage>(m_strat.get(), this->m_args, m_os)
    );
  }

  /* Execute the kernel for a given chunk of work. */
  virtual void execute_kernel(
    const TInput *inptr, size_t ld_in_row, size_t ld_in_col, size_t ld_in_vl,
    unsigned int pad_top, unsigned int valid_input_rows,
    unsigned int pad_left, unsigned int valid_input_cols,
    const TWeight *weights, const TAccum *bias,
    TOutput *outptr, size_t ld_out_row, size_t ld_out_col, size_t ld_out_vl,
    unsigned int valid_output_rows, unsigned int valid_output_cols,
    unsigned int first_channel, unsigned int valid_channels,
    WorkspaceType *ws
  ) const
  {
    // Initialise the output pointers
    for (auto i = 0u; i < m_strat->get_output_rows(); i++)
    {
      // Point at the output tensor for all valid rows; otherwise point at the
      // padding buffer.
      ws->output_row_ptrs[i] = i < valid_output_rows ? outptr : ws->output_padding_buffer;
      ws->output_ld_cols[i] = i < valid_output_rows ? ld_out_col : 0;
      ws->output_ld_vls[i] = i < valid_output_rows ? ld_out_vl : 0;
      outptr += ld_out_row;
    }

    // Execute the kernel
    PlanarKernelType<TInput, TWeight, TOutput, TAccum, OutputStage>::template execute<WorkspaceType>(
      reinterpret_cast<const PlanarStrategy<TInput, TWeight, TOutput, TAccum, OutputStage> *>(m_strat.get())->get_kernel(),
      inptr, ld_in_row, ld_in_col, ld_in_vl,
      pad_top, valid_input_rows, pad_left, valid_input_cols,
      weights, bias,
      ws->output_row_ptrs, ws->output_ld_cols, ws->output_ld_vls,
      valid_output_cols, first_channel, valid_channels,
      this->m_os, ws
    );
  }
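
  // For example (illustrative): if the strategy produces four rows per call
  // but only valid_output_rows == 2 remain, rows 2 and 3 are pointed at the
  // shared padding buffer with zero strides, so the kernel may store to
  // every row unconditionally without corrupting the real output tensor.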

  void execute_internal(
    const DepthwiseArgs &args,
    const void *input,
    size_t ld_input_col,
    size_t ld_input_row,
    size_t ld_input_batch,
    const void *parameters,
    void *output,
    size_t ld_output_col,
    size_t ld_output_row,
    size_t ld_output_batch,
    void *working_space,
    unsigned int thread_id,
    unsigned int n_threads
  ) const override
  {
    // Get and initialise the working space for this thread.
    void *thread_working_space =
      static_cast<uint8_t *>(working_space) + thread_id * this->get_working_size_per_thread();
    this->initialise_working_space(thread_working_space);
    auto ws = reinterpret_cast<WorkspaceType *>(thread_working_space);

    const auto n_output_channels = args.input_channels * args.channel_multiplier;
    const auto vl = get_vector_length<TAccum>(m_strat->get_vl_type());

    // Get typed pointers
    auto input_batch = reinterpret_cast<const TInput *>(input);
    auto output_batch = reinterpret_cast<TOutput *>(output);
    auto weights = reinterpret_cast<const TWeight *>(parameters);

    // Iterate over batches
    for (auto batches = args.n_batches; batches; batches--)
    {
      // NOTE: Other loop orderings are possible and it would be worth
      // investigating them.

      // Within a batch, stripe threads across rows: thread `thread_id`
      // starts at output row `thread_id * output_rows` and then advances by
      // `n_threads * output_rows` rows at a time.
      for (auto start_output_i = thread_id * m_strat->get_output_rows();
           start_output_i < args.output_rows;
           start_output_i += n_threads * m_strat->get_output_rows())
      {
        // Determine what padding (if any) is required on the top/bottom of
        // this row of the convolution.
        const int start_input_i = start_output_i * args.stride_rows - args.padding.top;
        const unsigned int input_pad_top = start_input_i < 0 ? -start_input_i : 0;
        const unsigned int input_i = start_input_i < 0 ? 0 : start_input_i;
        const unsigned int valid_input_rows = input_i > args.input_rows ? 0 : args.input_rows - input_i;
        const unsigned int valid_output_rows = args.output_rows - start_output_i;

        auto inptr_row = input_batch + input_i*ld_input_row;
        auto outptr_row = output_batch + start_output_i * ld_output_row;

        // Execute the kernel
        this->execute_kernel(
          inptr_row, ld_input_row, ld_input_col, vl,
          input_pad_top, valid_input_rows, args.padding.left, args.input_cols,
          weights, this->m_bias,
          outptr_row, ld_output_row, ld_output_col, vl,
          valid_output_rows, args.output_cols,
          0 /* first channel */, n_output_channels,
          ws
        );
      }

      // Update the input and output pointers to account for the batch
      input_batch += ld_input_batch;
      output_batch += ld_output_batch;
    }
  }
};

} // namespace depthwise
} // namespace arm_conv