/*
 * Copyright (c) 2021-2023 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#pragma once

#include "depthwise_depthfirst.hpp"
#include "interleaves/generic_quantized_dot_product.hpp"

#include <cstring>     // memset
#include <functional>  // std::function
#include <limits>
#include <memory>      // std::unique_ptr

namespace arm_conv {
namespace depthwise {

template <typename TInput, typename TWeight, typename TOutput, typename TAccum>
class DepthfirstMultiplierStrategy : public DepthwiseDepthfirstStrategyCommon<TInput, TWeight, TOutput, TAccum, Nothing>
{
  using Parent = DepthwiseDepthfirstStrategyCommon<TInput, TWeight, TOutput, TAccum, Nothing>;

  protected:
  virtual interleaves::PackingArguments get_packing_args(const DepthwiseArgs &args) const
  {
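    // The lambda passed below maps a linear kernel point index onto its
    // (row, column) position within the kernel; it returns false once all
    // kernel_rows * kernel_cols points have been consumed.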
    return interleaves::PackingArguments(
      args.kernel_rows, args.kernel_cols, sizeof(TWeight),
      true, sizeof(TAccum),
      this->get_vl_type(),
      sizeof(TAccum), 1,
      [args] (unsigned int pos, unsigned int &x, unsigned int &y) -> bool
      {
        if (pos < args.kernel_rows * args.kernel_cols)
        {
          y = pos % args.kernel_cols;
          x = pos / args.kernel_cols;
          return true;
        }
        return false;
      }
    );
  }

  public:
  using Parent::Parent;

  size_t get_storage_size(const DepthwiseArgs &args) const override
  {
    return interleaves::get_storage_size_generic(this->get_packing_args(args), args);
  }

  void pack_parameters(const DepthwiseArgs &args, void *buffer, const void *biases, const Nothing &, const void *weights, size_t ld_weight_col, size_t ld_weight_row) const override
  {
    interleaves::pack_parameters_generic(
      this->get_packing_args(args), args,
      buffer, biases, weights, ld_weight_col, ld_weight_row
    );
  }

  using KernelType = std::function<void(
    const TInput *const *,  // Input pointers
    TOutput *const *,  // Output pointers
    const void *,  // Ravelled bias, weights, and quantization parameters
    unsigned int,  // # output channels
    TAccum, TAccum  // Min and max activation clamps
  )>;
  virtual KernelType get_kernel(void) const = 0;
};


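// Specialisation for quantised kernels (int32_t accumulators): parameters are
// packed through the quantized interleaving routines and the kernel receives
// an arm_gemm::Requantize32 output stage instead of min/max activation clamps.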
template <typename TInput, typename TWeight, typename TOutput>
class DepthfirstMultiplierStrategy<TInput, TWeight, TOutput, int32_t> : public DepthwiseDepthfirstStrategyCommon<TInput, TWeight, TOutput, int32_t, arm_gemm::Requantize32>
{
  using Parent = DepthwiseDepthfirstStrategyCommon<TInput, TWeight, TOutput, int32_t, arm_gemm::Requantize32>;

  public:
  using Parent::Parent;

  size_t get_storage_size(const DepthwiseArgs &args) const override
  {
    return interleaves::quantized::get_storage_size(args, this->get_vl_type(), this->get_accumulator_depth_vl());
  }

  void pack_parameters(const DepthwiseArgs &args, void *buffer, const void *biases, const arm_gemm::Requantize32 &qp, const void *weights, size_t ld_weight_col, size_t ld_weight_row) const override
  {
    interleaves::quantized::pack_parameters<TWeight>(
      buffer, reinterpret_cast<const int32_t *>(biases),
      reinterpret_cast<const TWeight *>(weights), ld_weight_col, ld_weight_row,
      args, qp, this->get_vl_type(), this->get_accumulator_depth_vl()
    );
  }

  using KernelType = std::function<void(
    const TInput *const *,  // Input pointers
    TOutput *const *,  // Output pointers
    const void *,  // Ravelled bias, weights, and quantization parameters
    unsigned int,  // # output channels
    const arm_gemm::Requantize32 &
  )>;
  virtual KernelType get_kernel(void) const = 0;
};


template <typename TInput, typename TWeight, typename TOutput, typename TAccum>
class GenericDepthfirstMultiplierKernelStrategy
{
  const arm_gemm::VLType m_vl_type;
  const unsigned int m_output_rows, m_output_cols;

  public:
  GenericDepthfirstMultiplierKernelStrategy(unsigned int output_rows, unsigned int output_cols, arm_gemm::VLType vl_type)
  : m_vl_type(vl_type), m_output_rows(output_rows), m_output_cols(output_cols)
  {
  }

  virtual ~GenericDepthfirstMultiplierKernelStrategy() = default;

  arm_gemm::VLType get_vl_type(void) const { return m_vl_type; }
  unsigned int get_output_rows(void) const { return m_output_rows; }
  unsigned int get_output_cols(void) const { return m_output_cols; }

  using KernelType = std::function<void(
    const TInput *const *,  // Input pointers
    TOutput *const *,  // Output pointers
    const TWeight *,  // Ravelled weight parameters
    const TAccum *,  // Bias
    unsigned int, unsigned int,  // Number of kernel points, number of output channels
    TAccum, TAccum  // Activation minimum and maximum
  )>;
  virtual KernelType get_kernel(void) const = 0;
};

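// Specialisation for quantised kernels: in addition to the ravelled weights
// and bias, the kernel receives the per-channel left-shifts, multipliers and
// right-shifts (offset to the starting channel) and the Requantize32 parameters.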
template <typename TInput, typename TWeight, typename TOutput>
class GenericDepthfirstMultiplierKernelStrategy<TInput, TWeight, TOutput, int32_t>
{
  const arm_gemm::VLType m_vl_type;
  const unsigned int m_output_rows, m_output_cols;

  public:
  GenericDepthfirstMultiplierKernelStrategy(unsigned int output_rows, unsigned int output_cols, arm_gemm::VLType vl_type)
  : m_vl_type(vl_type), m_output_rows(output_rows), m_output_cols(output_cols)
  {
  }

  virtual ~GenericDepthfirstMultiplierKernelStrategy() = default;

  arm_gemm::VLType get_vl_type(void) const { return m_vl_type; }
  unsigned int get_output_rows(void) const { return m_output_rows; }
  unsigned int get_output_cols(void) const { return m_output_cols; }

  using KernelType = std::function<void(
    const TInput *const *,  // Input pointers
    TOutput *const *,  // Output pointers
    const TWeight *,  // Ravelled weight parameters
    const int32_t *,  // Bias
    unsigned int, unsigned int,  // Number of kernel points, number of output channels
    const int32_t *, const int32_t *, const int32_t *,  // Per-channel left-shifts, multipliers, right-shifts (need to account for start channel)
    const arm_gemm::Requantize32 &
  )>;
  virtual KernelType get_kernel(void) const = 0;
};

template <typename TInput,
          typename TWeight=TInput,
          typename TOutput=TInput,
          typename TAccum=typename DefaultTAccum<TInput>::Type,
          typename OutputStage=typename DefaultOutputStage<TOutput>::Type>
class GenericDepthfirstMultiplierStrategy : public DepthwiseDepthfirstStrategyCommon<TInput, TWeight, TOutput, TAccum, OutputStage>
{
  using KernelStrategyType = GenericDepthfirstMultiplierKernelStrategy<TInput, TWeight, TOutput, TAccum>;
  std::unique_ptr<KernelStrategyType> m_kern;

  protected:
  virtual interleaves::PackingArguments get_packing_args(const DepthwiseArgs &args) const
  {
    return interleaves::PackingArguments(
      args.kernel_rows, args.kernel_cols, sizeof(TWeight),
      false, sizeof(TAccum),
      this->get_vl_type(),
      sizeof(TAccum), 1,
      [args] (unsigned int pos, unsigned int &x, unsigned int &y) -> bool
      {
        if (pos < args.kernel_rows * args.kernel_cols)
        {
          y = pos % args.kernel_cols;
          x = pos / args.kernel_cols;
          return true;
        }
        return false;
      }
    );
  }

  public:
  GenericDepthfirstMultiplierStrategy(KernelStrategyType *kern, const DepthwiseArgs &args)
  : DepthwiseDepthfirstStrategyCommon<TInput, TWeight, TOutput, TAccum, OutputStage>(
      kern->get_output_rows(), kern->get_output_cols(),
      args.kernel_rows, args.kernel_cols,
      args.stride_rows, args.stride_cols
    ),
    m_kern(kern)
  {
  }

  arm_gemm::VLType get_vl_type(void) const override { return m_kern->get_vl_type(); }
  const typename KernelStrategyType::KernelType get_kernel(void) const { return m_kern->get_kernel(); }

  size_t get_storage_size(const DepthwiseArgs &args) const override
  {
    return interleaves::get_storage_size_generic(this->get_packing_args(args), args);
  }

  void pack_parameters(const DepthwiseArgs &args, void *buffer, const void *biases, const OutputStage &, const void *weights, size_t ld_weight_col, size_t ld_weight_row) const override
  {
    interleaves::pack_parameters_generic(
      this->get_packing_args(args), args,
      buffer, biases, weights, ld_weight_col, ld_weight_row
    );
  }
};

// Specialise elements of the wrapper based on the type of kernel.
namespace depthfirst_multiplier {

/* Working space element which contains a pointer for each row of input, a row
 * of padding, and a space which can be used to construct an NCHW-ordered patch
 * of input.
 */
template <typename T, bool IsGeneric=false, typename OutputStage=Nothing>
class InputPatchElement
{
  public:
  struct Workspace
  {
    constexpr static bool InputPatchIsGeneric = IsGeneric;
    const T **input_rows;
    T *input_padding;
    T *input_patch;
  };

  static size_t get_element_size(const WorkspaceArgs<IDepthfirstStrategy, OutputStage> &args)
  {
    return sizeof_input_rows(args) + sizeof_input_padding(args) + sizeof_input_patch(args);
  }

  template <class WorkspaceType>
  static void *initialise(WorkspaceType *ws, void *buffer, const WorkspaceArgs<IDepthfirstStrategy, OutputStage> &args)
  {
    auto buffer_bytes = reinterpret_cast<char *>(buffer);

    ws->input_rows = reinterpret_cast<const T **>(buffer_bytes);
    buffer_bytes += sizeof_input_rows(args);

    ws->input_padding = reinterpret_cast<T*>(buffer_bytes);
    buffer_bytes += sizeof_input_padding(args);

    ws->input_patch = reinterpret_cast<T*>(buffer_bytes);
    buffer_bytes += sizeof_input_patch(args);

    // Initialise the padding
    memset(ws->input_padding,
           get_input_buffer_fill_value(args.output_stage),
           sizeof_input_padding(args));

    return buffer_bytes;
  }

  protected:
  static size_t sizeof_input_rows(const WorkspaceArgs<IDepthfirstStrategy, OutputStage> &args)
  {
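    // Generic kernels ravel the patch by kernel point, so they need one row
    // pointer per kernel point per output row; specialised kernels only need
    // one pointer per input row of the tile.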
    if (IsGeneric)
    {
      return sizeof(T *) * args.strategy->get_output_rows() * args.depthwise_args.kernel_rows * args.depthwise_args.kernel_cols;
    }
    else
    {
      return sizeof(T *) * args.strategy->get_input_rows();
    }
  }

  static size_t sizeof_input_padding(const WorkspaceArgs<IDepthfirstStrategy, OutputStage> &args)
  {
    // Round-up the number of columns to be a whole number of QUADS
    auto input_cols = arm_gemm::roundup<size_t>(args.strategy->get_input_cols(), 16 / sizeof(T));
    return sizeof(T) * input_cols;
  }

  static size_t sizeof_input_patch(const WorkspaceArgs<IDepthfirstStrategy, OutputStage> &args)
  {
    if (IsGeneric)
    {
      // Round-up the number of columns to be a whole number of QUADS
      auto output_cols = arm_gemm::roundup<size_t>(args.strategy->get_output_cols(), 16 / sizeof(T));
      const auto kernel_points = args.depthwise_args.kernel_rows * args.depthwise_args.kernel_cols;
      return sizeof(T) * kernel_points * args.strategy->get_output_rows() * output_cols;
    }
    else
    {
      // Round-up the number of columns to be a whole number of QUADS
      auto input_cols = arm_gemm::roundup<size_t>(args.strategy->get_input_cols(), 16 / sizeof(T));
      return sizeof(T) * args.strategy->get_input_rows() * input_cols;
    }
  }
};

template <bool IsGeneric, typename TInput, typename TWeight, typename TOutput, typename TAccum, typename OutputStage>
struct StrategyType
{
  using Type = DepthfirstMultiplierStrategy<TInput, TWeight, TOutput, TAccum>;

  template <typename WorkspaceType>
  static void execute(
    const DepthwiseArgs &args, const WorkspaceType *ws, const Type *strat,
    const OutputStage &, const unsigned int,
    const void *parameters, const void *
  )
  {
    strat->get_kernel()(
      ws->input_rows,
      ws->outptr_array,
      parameters, args.channel_multiplier,
      ws->activation_min, ws->activation_max
    );
  }
};

template <typename TInput, typename TWeight, typename TOutput, typename TAccum, typename OutputStage>
struct StrategyType<true, TInput, TWeight, TOutput, TAccum, OutputStage>
{
  using Type = GenericDepthfirstMultiplierStrategy<TInput, TWeight, TOutput, TAccum, OutputStage>;

  template <typename WorkspaceType>
  static void execute(
    const DepthwiseArgs &args, const WorkspaceType *ws, const Type *strat,
    const OutputStage &, const unsigned int start_output_channel,
    const void *parameters, const void *bias
  )
  {
    strat->get_kernel()(
      ws->input_rows, ws->outptr_array,
      reinterpret_cast<const TWeight *>(parameters),
      bias == nullptr ? nullptr : reinterpret_cast<const TAccum *>(bias) + start_output_channel,
      strat->get_kernel_rows() * strat->get_kernel_cols(),
      args.channel_multiplier,
      ws->activation_min, ws->activation_max
    );
  }
};

template <typename TInput, typename TWeight, typename TOutput>
struct StrategyType<false, TInput, TWeight, TOutput, int32_t, arm_gemm::Requantize32>
{
  using Type = DepthfirstMultiplierStrategy<TInput, TWeight, TOutput, int32_t>;

  template <typename WorkspaceType>
  static void execute(
    const DepthwiseArgs &args, const WorkspaceType *ws, const Type *strat,
    const arm_gemm::Requantize32 &qp, const unsigned int,
    const void *parameters, const void *
  )
  {
    strat->get_kernel()(
      ws->input_rows,
      ws->outptr_array,
      parameters, args.channel_multiplier,
      qp
    );
  }
};

template <typename TInput, typename TWeight, typename TOutput>
struct StrategyType<true, TInput, TWeight, TOutput, int32_t, arm_gemm::Requantize32>
{
  using Type = GenericDepthfirstMultiplierStrategy<TInput, TWeight, TOutput, int32_t, arm_gemm::Requantize32>;

  template <typename WorkspaceType>
  static void execute(
    const DepthwiseArgs &args, const WorkspaceType *ws, const Type *strat,
    const arm_gemm::Requantize32 &qp, const unsigned int start_output_channel,
    const void *parameters, const void *
  )
  {
    auto get_ptr = [start_output_channel] (const int32_t *ptr) -> const int32_t *
    {
      return ptr == nullptr ? nullptr : ptr + start_output_channel;
    };

    strat->get_kernel()(
      ws->input_rows, ws->outptr_array,
      reinterpret_cast<const TWeight *>(parameters),
      get_ptr(qp.bias),
      strat->get_kernel_rows() * strat->get_kernel_cols(),
      args.channel_multiplier,
      get_ptr(qp.per_channel_left_shifts),
      get_ptr(qp.per_channel_muls),
      get_ptr(qp.per_channel_right_shifts),
      qp
    );
  }
};

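// Build the padded input patch for one tile: the specialised (non-generic)
// path fills an NCHW patch spanning the strategy's input tile, while the
// generic path ravels one patch per kernel point so that kernels of arbitrary
// size and stride can be handled.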
template <bool IsGeneric> struct PrepareInputSample;

template <> struct PrepareInputSample<false>
{
  template <typename WorkspaceType, typename StrategyType, typename T>
  static void execute(
    const DepthwiseArgs &, WorkspaceType *ws, const StrategyType *strat,
    T *base_ptr, size_t ld_row, size_t ld_col,
    const unsigned int input_pad_top, const unsigned int valid_rows,
    const unsigned int input_pad_left, const unsigned int valid_cols
  )
  {
    fill_nchw_patch_array(
      ws->input_rows, ws->input_patch, strat->get_input_rows(), strat->get_input_cols(),
      base_ptr, ld_row, ld_col,
      ws->input_padding,
      input_pad_top, valid_rows,
      input_pad_left, valid_cols
    );
  }
};

template <> struct PrepareInputSample<true>
{
  template <typename WorkspaceType, typename StrategyType, typename T>
  static void execute(
    const DepthwiseArgs &args, WorkspaceType *ws, const StrategyType *strat,
    T *base_ptr, size_t ld_row, size_t ld_col,
    const unsigned int input_pad_top, const unsigned int valid_rows,
    const unsigned int input_pad_left, const unsigned int valid_cols
  )
  {
    fill_patch_array_generic_kernel(
      ws->input_rows, ws->input_patch,
      strat->get_output_rows(), strat->get_output_cols(),
      args.kernel_rows, args.kernel_cols,
      args.stride_rows, args.stride_cols,
      base_ptr, ld_row, ld_col,
      ws->input_padding,
      input_pad_top, valid_rows,
      input_pad_left, valid_cols
    );
  }
};

}  // namespace depthfirst_multiplier

template <typename TInput,
          typename TWeight=TInput,
          typename TOutput=TInput,
          typename TAccum=typename DefaultTAccum<TInput>::Type,
          bool is_generic=false,
          typename OutputStage=typename DefaultOutputStage<TOutput>::Type>
class DepthwiseDepthfirstMultiplier : public DepthfirstDriver<TInput, TWeight, TOutput>
{
  protected:
  using StratType = typename depthfirst_multiplier::StrategyType<is_generic, TInput, TWeight, TOutput, TAccum, OutputStage>::Type;
  using WorkspaceManager = Workspace<
    OutputArrayElement<TOutput>,
    depthfirst_multiplier::InputPatchElement<TInput, is_generic, OutputStage>,
    ActivationsElement<TOutput, OutputStage>
  >;
  using WorkingSpace = typename WorkspaceManager::WorkspaceType;

  OutputStage m_os;  // Copy of the output parameters
  const void *m_bias = nullptr;  // Copy of the bias (should we need it)

  public:
  DepthwiseDepthfirstMultiplier(StratType *const strat, const DepthwiseArgs &args, const OutputStage &os = {})
  : DepthfirstDriver<TInput, TWeight, TOutput>(strat, args), m_os(os)
  {
  }

  DepthwiseDepthfirstMultiplier(DepthwiseDepthfirstMultiplier &) = delete;
  DepthwiseDepthfirstMultiplier &operator=(DepthwiseDepthfirstMultiplier &) = delete;

  size_t get_storage_size(void) const override
  {
    return reinterpret_cast<const StratType *>(this->m_strat.get())
      ->get_storage_size(this->m_args);
  }

  void pack_parameters(void *buffer, const void *biases, const void *weights, size_t ld_weight_col, size_t ld_weight_row) override
  {
    reinterpret_cast<const StratType *>(this->m_strat.get())
      ->pack_parameters(this->m_args, buffer, biases, m_os, weights, ld_weight_col, ld_weight_row);
    m_bias = biases;
    depthwise_depthfirst::stash_bias(m_os, biases);
  }

  size_t get_working_size_per_thread(const unsigned int n_input_channels) const override
  {
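    // Size the workspace as if this thread were handling n_input_channels
    // channels of the problem described by m_args.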
    DepthwiseArgs args(this->m_args);
    args.input_channels = n_input_channels;
    return WorkspaceManager::get_sizeof_workspace(WorkspaceArgs<IDepthfirstStrategy, OutputStage>(this->m_strat.get(), args, m_os));
  }

  void initialise_working_space(void *buffer, unsigned int n_input_channels) const override
  {
    DepthwiseArgs args(this->m_args);
    args.input_channels = n_input_channels;
    return WorkspaceManager::initialise(buffer, WorkspaceArgs<IDepthfirstStrategy, OutputStage>(this->m_strat.get(), args, m_os));
  }

  void compute_tile_padded(
    const DepthwiseArgs &args,
    unsigned int output_i, unsigned int output_j,
    unsigned int output_channel_start, unsigned int output_channel_end,
    const TensorSpec<const TInput *> &input,
    const TensorSpec<TOutput *> &output,
    const void *parameters,
    void *working_space_raw
  ) const override
  {
    // Get the working space
    auto ws = reinterpret_cast<WorkingSpace *>(working_space_raw);

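    // Map the output tile origin back into input space; any part which falls
    // before the start of the input becomes explicit top/left padding.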
    const int ii = static_cast<int>(output_i * args.stride_rows) - args.padding.top;
    const auto input_pad_top = static_cast<unsigned int>(ii < 0 ? -ii : 0);
    const auto input_i = static_cast<unsigned int>(ii < 0 ? 0 : ii);

    const int ij = static_cast<int>(output_j * args.stride_cols) - args.padding.left;
    const auto input_pad_left = static_cast<unsigned int>(ij < 0 ? -ij : 0);
    const auto input_j = static_cast<unsigned int>(ij < 0 ? 0 : ij);

    // Compute the output pointer array. We'll update this array after every
    // invocation of the kernel.
    fill_pointer_array(
      ws->outptr_array, this->m_strat->get_output_rows(), this->m_strat->get_output_cols(),
      output.base + output_i*output.ld_row + output_j*output.ld_col + output_channel_start,
      output.ld_row, output.ld_col,
      ws->output_buffer,
      0, args.output_rows - output_i,  // Top padding, # valid rows
      0, args.output_cols - output_j  // Left padding, # valid columns
    );

    // Compute the parameter stride
    DepthwiseArgs single_iter(args);
    single_iter.input_channels = 1;
    const size_t parameter_stride = reinterpret_cast<const StratType *>(this->m_strat.get())
      ->get_storage_size(single_iter);

    for (; output_channel_start < output_channel_end;
         output_channel_start += args.channel_multiplier)
    {
      // Compute the input channel for this group of output channels
      const auto input_channel = output_channel_start / args.channel_multiplier;

      // Construct the input patch
      depthfirst_multiplier::PrepareInputSample<is_generic>::execute(
        args, ws, this->m_strat.get(),
        input.base + input_channel + input_i*input.ld_row + input_j*input.ld_col, input.ld_row, input.ld_col,
        input_pad_top, args.input_rows - input_i,
        input_pad_left, args.input_cols - input_j
      );

      // Execute the kernel
      depthfirst_multiplier::StrategyType<is_generic, TInput, TWeight, TOutput, TAccum, OutputStage>::execute(
        args, ws, reinterpret_cast<const StratType *>(this->m_strat.get()), m_os, output_channel_start,
        parameters, m_bias
      );

      // Update the output pointers
      for (unsigned int n = 0; n < this->m_strat->get_output_rows() * this->m_strat->get_output_cols(); n++)
      {
        ws->outptr_array[n] += args.channel_multiplier;
      }

      // Progress the parameters
      parameters = reinterpret_cast<const char *>(parameters) + parameter_stride;
    }
  }
};

}  // namespace depthwise
}  // namespace arm_conv