/*
 * Copyright (c) 2021-2022 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#pragma once

#include "depthwise_depthfirst.hpp"
#include "interleaves/generic_quantized_dot_product.hpp"

#ifdef CYCLE_PROFILING
#include "profiler.hpp"
#endif

#include <cstring>     // memset
#include <functional>  // std::function
#include <limits>
#include <memory>      // std::unique_ptr

namespace arm_conv {
namespace depthwise {

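// Strategy for depth-first depthwise kernels which handle a channel multiplier
// directly. The bias is interleaved into the packed parameter buffer alongside
// the weights, and the kernel is invoked with arrays of input and output row
// pointers plus the activation clamps.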
template <typename TInput, typename TWeight, typename TOutput, typename TAccum>
class DepthfirstMultiplierStrategy : public DepthwiseDepthfirstStrategyCommon<TInput, TWeight, TOutput, TAccum, Nothing>
{
  using Parent = DepthwiseDepthfirstStrategyCommon<TInput, TWeight, TOutput, TAccum, Nothing>;

  protected:
  virtual interleaves::PackingArguments get_packing_args(const DepthwiseArgs &args) const
  {
    return interleaves::PackingArguments(
      args.kernel_rows, args.kernel_cols, sizeof(TWeight),
      true, sizeof(TAccum),
      this->get_vl_type(),
      sizeof(TAccum), 1,
      [args] (unsigned int pos, unsigned int &x, unsigned int &y) -> bool
      {
        if (pos < args.kernel_rows * args.kernel_cols)
        {
          y = pos % args.kernel_cols;
          x = pos / args.kernel_cols;
          return true;
        }
        return false;
      }
    );
  }

  public:
  using Parent::Parent;

  size_t get_storage_size(const DepthwiseArgs &args) const override
  {
    return interleaves::get_storage_size_generic(this->get_packing_args(args), args);
  }

  void pack_parameters(const DepthwiseArgs &args, void *buffer, const void *biases, const Nothing &, const void *weights, size_t ld_weight_col, size_t ld_weight_row) const override
  {
    interleaves::pack_parameters_generic(
      this->get_packing_args(args), args,
      buffer, biases, weights, ld_weight_col, ld_weight_row
    );
  }

  using KernelType = std::function<void(
    const TInput *const *,  // Input pointers
    TOutput *const *,  // Output pointers
    const void *,  // Ravelled bias, weights, and quantization parameters
    unsigned int,  // # output channels
    TAccum, TAccum  // Min and max activation clamps
  )>;
  virtual KernelType get_kernel(void) const = 0;
};


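// Specialisation for quantized kernels (int32_t accumulators): the
// requantization parameters are packed alongside the weights and the
// arm_gemm::Requantize32 description is passed to the kernel in place of the
// activation clamps.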
template <typename TInput, typename TWeight, typename TOutput>
class DepthfirstMultiplierStrategy<TInput, TWeight, TOutput, int32_t> : public DepthwiseDepthfirstStrategyCommon<TInput, TWeight, TOutput, int32_t, arm_gemm::Requantize32>
{
  using Parent = DepthwiseDepthfirstStrategyCommon<TInput, TWeight, TOutput, int32_t, arm_gemm::Requantize32>;

  public:
  using Parent::Parent;

  size_t get_storage_size(const DepthwiseArgs &args) const override
  {
    return interleaves::quantized::get_storage_size(args, this->get_vl_type(), this->get_accumulator_depth_vl());
  }

  void pack_parameters(const DepthwiseArgs &args, void *buffer, const void *biases, const arm_gemm::Requantize32 &qp, const void *weights, size_t ld_weight_col, size_t ld_weight_row) const override
  {
    interleaves::quantized::pack_parameters<TWeight>(
      buffer, reinterpret_cast<const int32_t *>(biases),
      reinterpret_cast<const TWeight *>(weights), ld_weight_col, ld_weight_row,
      args, qp, this->get_vl_type(), this->get_accumulator_depth_vl()
    );
  }

  using KernelType = std::function<void(
    const TInput *const *,  // Input pointers
    TOutput *const *,  // Output pointers
    const void *,  // Ravelled bias, weights, and quantization parameters
    unsigned int,  // # output channels
    const arm_gemm::Requantize32 &
  )>;
  virtual KernelType get_kernel(void) const = 0;
};


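// Interface describing a "generic" multiplier kernel, i.e. one which is not
// specialised for a particular kernel size: the number of kernel points is
// passed at run time and the bias is supplied separately from the ravelled
// weights.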
template <typename TInput, typename TWeight, typename TOutput, typename TAccum>
class GenericDepthfirstMultiplierKernelStrategy
{
  const arm_gemm::VLType m_vl_type;
  const unsigned int m_output_rows, m_output_cols;

  public:
  GenericDepthfirstMultiplierKernelStrategy(unsigned int output_rows, unsigned int output_cols, arm_gemm::VLType vl_type)
  : m_vl_type(vl_type), m_output_rows(output_rows), m_output_cols(output_cols)
  {
  }

  virtual ~GenericDepthfirstMultiplierKernelStrategy() = default;

  arm_gemm::VLType get_vl_type(void) const { return m_vl_type; }
  unsigned int get_output_rows(void) const { return m_output_rows; }
  unsigned int get_output_cols(void) const { return m_output_cols; }

  using KernelType = std::function<void(
    const TInput *const *,  // Input pointers
    TOutput *const *,  // Output pointers
    const TWeight *,  // Ravelled weight parameters
    const TAccum *,  // Bias
    unsigned int, unsigned int,  // Number of kernel points, number of output channels
    TAccum, TAccum  // Activation minimum and maximum
  )>;
  virtual KernelType get_kernel(void) const = 0;
};

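// Quantized variant of the generic multiplier kernel interface: per-channel
// left-shifts, multipliers and right-shifts are passed explicitly (offset to
// the starting output channel) together with the Requantize32 parameters.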
template <typename TInput, typename TWeight, typename TOutput>
class GenericDepthfirstMultiplierKernelStrategy<TInput, TWeight, TOutput, int32_t>
{
  const arm_gemm::VLType m_vl_type;
  const unsigned int m_output_rows, m_output_cols;

  public:
  GenericDepthfirstMultiplierKernelStrategy(unsigned int output_rows, unsigned int output_cols, arm_gemm::VLType vl_type)
  : m_vl_type(vl_type), m_output_rows(output_rows), m_output_cols(output_cols)
  {
  }

  virtual ~GenericDepthfirstMultiplierKernelStrategy() = default;

  arm_gemm::VLType get_vl_type(void) const { return m_vl_type; }
  unsigned int get_output_rows(void) const { return m_output_rows; }
  unsigned int get_output_cols(void) const { return m_output_cols; }

  using KernelType = std::function<void(
    const TInput *const *,  // Input pointers
    TOutput *const *,  // Output pointers
    const TWeight *,  // Ravelled weight parameters
    const int32_t *,  // Bias
    unsigned int, unsigned int,  // Number of kernel points, number of output channels
    const int32_t *, const int32_t *, const int32_t *,  // Per-channel left-shifts, multipliers, right-shifts (need to account for start channel)
    const arm_gemm::Requantize32 &
  )>;
  virtual KernelType get_kernel(void) const = 0;
};

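// Strategy which wraps a generic multiplier kernel. The output tile size is
// taken from the wrapped kernel, while the kernel and stride sizes come from
// the depthwise arguments; the bias is passed to the kernel separately rather
// than being read from the packed parameter buffer.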
template <typename TInput,
          typename TWeight=TInput,
          typename TOutput=TInput,
          typename TAccum=typename DefaultTAccum<TInput>::Type,
          typename OutputStage=typename DefaultOutputStage<TOutput>::Type>
class GenericDepthfirstMultiplierStrategy : public DepthwiseDepthfirstStrategyCommon<TInput, TWeight, TOutput, TAccum, OutputStage>
{
  using KernelStrategyType = GenericDepthfirstMultiplierKernelStrategy<TInput, TWeight, TOutput, TAccum>;
  std::unique_ptr<KernelStrategyType> m_kern;

  protected:
  virtual interleaves::PackingArguments get_packing_args(const DepthwiseArgs &args) const
  {
    return interleaves::PackingArguments(
      args.kernel_rows, args.kernel_cols, sizeof(TWeight),
      false, sizeof(TAccum),
      this->get_vl_type(),
      sizeof(TAccum), 1,
      [args] (unsigned int pos, unsigned int &x, unsigned int &y) -> bool
      {
        if (pos < args.kernel_rows * args.kernel_cols)
        {
          y = pos % args.kernel_cols;
          x = pos / args.kernel_cols;
          return true;
        }
        return false;
      }
    );
  }

  public:
  GenericDepthfirstMultiplierStrategy(KernelStrategyType *kern, const DepthwiseArgs &args)
  : DepthwiseDepthfirstStrategyCommon<TInput, TWeight, TOutput, TAccum, OutputStage>(
      kern->get_output_rows(), kern->get_output_cols(),
      args.kernel_rows, args.kernel_cols,
      args.stride_rows, args.stride_cols
    ),
    m_kern(kern)
  {
  }

  arm_gemm::VLType get_vl_type(void) const override { return m_kern->get_vl_type(); }
  const typename KernelStrategyType::KernelType get_kernel(void) const { return m_kern->get_kernel(); }

  size_t get_storage_size(const DepthwiseArgs &args) const override
  {
    return interleaves::get_storage_size_generic(this->get_packing_args(args), args);
  }

  void pack_parameters(const DepthwiseArgs &args, void *buffer, const void *biases, const OutputStage &, const void *weights, size_t ld_weight_col, size_t ld_weight_row) const override
  {
    interleaves::pack_parameters_generic(
      this->get_packing_args(args), args,
      buffer, biases, weights, ld_weight_col, ld_weight_row
    );
  }
};

// Specialise elements of the wrapper based on the type of kernel.
namespace depthfirst_multiplier {

/* Working space element which contains a pointer for each row of input, a row
 * of padding, and a space which can be used to construct an NCHW-ordered patch
 * of input.
 */
template <typename T, bool IsGeneric=false, typename OutputStage=Nothing>
class InputPatchElement
{
  public:
  struct Workspace
  {
    constexpr static bool InputPatchIsGeneric = IsGeneric;
    const T **input_rows;
    T *input_padding;
    T *input_patch;
  };

  static size_t get_element_size(const WorkspaceArgs<IDepthfirstStrategy, OutputStage> &args)
  {
    return sizeof_input_rows(args) + sizeof_input_padding(args) + sizeof_input_patch(args);
  }

  template <class WorkspaceType>
  static void *initialise(WorkspaceType *ws, void *buffer, const WorkspaceArgs<IDepthfirstStrategy, OutputStage> &args)
  {
    auto buffer_bytes = reinterpret_cast<char *>(buffer);

    ws->input_rows = reinterpret_cast<const T **>(buffer_bytes);
    buffer_bytes += sizeof_input_rows(args);

    ws->input_padding = reinterpret_cast<T*>(buffer_bytes);
    buffer_bytes += sizeof_input_padding(args);

    ws->input_patch = reinterpret_cast<T*>(buffer_bytes);
    buffer_bytes += sizeof_input_patch(args);

    // Initialise the padding
    memset(ws->input_padding,
           get_input_buffer_fill_value(args.output_stage),
           sizeof_input_padding(args));

    return buffer_bytes;
  }

  protected:
  static size_t sizeof_input_rows(const WorkspaceArgs<IDepthfirstStrategy, OutputStage> &args)
  {
    if (IsGeneric)
    {
      return sizeof(T *) * args.strategy->get_output_rows() * args.depthwise_args.kernel_rows * args.depthwise_args.kernel_cols;
    }
    else
    {
      return sizeof(T *) * args.strategy->get_input_rows();
    }
  }

  static size_t sizeof_input_padding(const WorkspaceArgs<IDepthfirstStrategy, OutputStage> &args)
  {
    // Round-up the number of columns to be a whole number of QUADS
    auto input_cols = arm_gemm::roundup<size_t>(args.strategy->get_input_cols(), 16 / sizeof(T));
    return sizeof(T) * input_cols;
  }

  static size_t sizeof_input_patch(const WorkspaceArgs<IDepthfirstStrategy, OutputStage> &args)
  {
    if (IsGeneric)
    {
      // Round-up the number of columns to be a whole number of QUADS
      auto output_cols = arm_gemm::roundup<size_t>(args.strategy->get_output_cols(), 16 / sizeof(T));
      const auto kernel_points = args.depthwise_args.kernel_rows * args.depthwise_args.kernel_cols;
      return sizeof(T) * kernel_points * args.strategy->get_output_rows() * output_cols;
    }
    else
    {
      // Round-up the number of columns to be a whole number of QUADS
      auto input_cols = arm_gemm::roundup<size_t>(args.strategy->get_input_cols(), 16 / sizeof(T));
      return sizeof(T) * args.strategy->get_input_rows() * input_cols;
    }
  }
};

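// Dispatch helper: selects the concrete strategy type for a given combination
// of (generic vs. specialised, data types, output stage) and knows how to call
// that strategy's kernel with the pointer arrays, packed parameters and
// activation or requantization arguments it expects.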
template <bool IsGeneric, typename TInput, typename TWeight, typename TOutput, typename TAccum, typename OutputStage>
struct StrategyType
{
  using Type = DepthfirstMultiplierStrategy<TInput, TWeight, TOutput, TAccum>;

  template <typename WorkspaceType>
  static void execute(
    const DepthwiseArgs &args, const WorkspaceType *ws, const Type *strat,
    const OutputStage &, const unsigned int,
    const void *parameters, const void *
  )
  {
    strat->get_kernel()(
      ws->input_rows,
      ws->outptr_array,
      parameters, args.channel_multiplier,
      ws->activation_min, ws->activation_max
    );
  }
};

template <typename TInput, typename TWeight, typename TOutput, typename TAccum, typename OutputStage>
struct StrategyType<true, TInput, TWeight, TOutput, TAccum, OutputStage>
{
  using Type = GenericDepthfirstMultiplierStrategy<TInput, TWeight, TOutput, TAccum, OutputStage>;

  template <typename WorkspaceType>
  static void execute(
    const DepthwiseArgs &args, const WorkspaceType *ws, const Type *strat,
    const OutputStage &, const unsigned int start_output_channel,
    const void *parameters, const void *bias
  )
  {
    strat->get_kernel()(
      ws->input_rows, ws->outptr_array,
      reinterpret_cast<const TWeight *>(parameters),
      bias == nullptr ? nullptr : reinterpret_cast<const TAccum *>(bias) + start_output_channel,
      strat->get_kernel_rows() * strat->get_kernel_cols(),
      args.channel_multiplier,
      ws->activation_min, ws->activation_max
    );
  }
};

template <typename TInput, typename TWeight, typename TOutput>
struct StrategyType<false, TInput, TWeight, TOutput, int32_t, arm_gemm::Requantize32>
{
  using Type = DepthfirstMultiplierStrategy<TInput, TWeight, TOutput, int32_t>;

  template <typename WorkspaceType>
  static void execute(
    const DepthwiseArgs &args, const WorkspaceType *ws, const Type *strat,
    const arm_gemm::Requantize32 &qp, const unsigned int,
    const void *parameters, const void *
  )
  {
    strat->get_kernel()(
      ws->input_rows,
      ws->outptr_array,
      parameters, args.channel_multiplier,
      qp
    );
  }
};

template <typename TInput, typename TWeight, typename TOutput>
struct StrategyType<true, TInput, TWeight, TOutput, int32_t, arm_gemm::Requantize32>
{
  using Type = GenericDepthfirstMultiplierStrategy<TInput, TWeight, TOutput, int32_t, arm_gemm::Requantize32>;

  template <typename WorkspaceType>
  static void execute(
    const DepthwiseArgs &args, const WorkspaceType *ws, const Type *strat,
    const arm_gemm::Requantize32 &qp, const unsigned int start_output_channel,
    const void *parameters, const void *
  )
  {
    auto get_ptr = [start_output_channel] (const int32_t *ptr) -> const int32_t *
    {
      return ptr == nullptr ? nullptr : ptr + start_output_channel;
    };

    strat->get_kernel()(
      ws->input_rows, ws->outptr_array,
      reinterpret_cast<const TWeight *>(parameters),
      get_ptr(qp.bias),
      strat->get_kernel_rows() * strat->get_kernel_cols(),
      args.channel_multiplier,
      get_ptr(qp.per_channel_left_shifts),
      get_ptr(qp.per_channel_muls),
      get_ptr(qp.per_channel_right_shifts),
      qp
    );
  }
};

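// Helpers which populate the input rows and NCHW input patch in the working
// space: the specialised form fills a patch matching the strategy's expected
// input tile, while the generic form gathers a patch for every kernel point.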
template <bool IsGeneric> struct PrepareInputSample;

template <> struct PrepareInputSample<false>
{
  template <typename WorkspaceType, typename StrategyType, typename T>
  static void execute(
    const DepthwiseArgs &, WorkspaceType *ws, const StrategyType *strat,
    T *base_ptr, size_t ld_row, size_t ld_col,
    const unsigned int input_pad_top, const unsigned int valid_rows,
    const unsigned int input_pad_left, const unsigned int valid_cols
  )
  {
    fill_nchw_patch_array(
      ws->input_rows, ws->input_patch, strat->get_input_rows(), strat->get_input_cols(),
      base_ptr, ld_row, ld_col,
      ws->input_padding,
      input_pad_top, valid_rows,
      input_pad_left, valid_cols
    );
  }
};

template <> struct PrepareInputSample<true>
{
  template <typename WorkspaceType, typename StrategyType, typename T>
  static void execute(
    const DepthwiseArgs &args, WorkspaceType *ws, const StrategyType *strat,
    T *base_ptr, size_t ld_row, size_t ld_col,
    const unsigned int input_pad_top, const unsigned int valid_rows,
    const unsigned int input_pad_left, const unsigned int valid_cols
  )
  {
    fill_patch_array_generic_kernel(
      ws->input_rows, ws->input_patch,
      strat->get_output_rows(), strat->get_output_cols(),
      args.kernel_rows, args.kernel_cols,
      args.stride_rows, args.stride_cols,
      base_ptr, ld_row, ld_col,
      ws->input_padding,
      input_pad_top, valid_rows,
      input_pad_left, valid_cols
    );
  }
};

} // namespace depthfirst_multiplier

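// Depth-first driver for depthwise convolution with a channel multiplier. Each
// output tile is computed one input channel at a time: an NCHW patch of that
// channel is built in the working space, and the kernel then produces the
// corresponding group of channel_multiplier output channels.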
template <typename TInput,
          typename TWeight=TInput,
          typename TOutput=TInput,
          typename TAccum=typename DefaultTAccum<TInput>::Type,
          bool is_generic=false,
          typename OutputStage=typename DefaultOutputStage<TOutput>::Type>
class DepthwiseDepthfirstMultiplier : public DepthfirstDriver<TInput, TWeight, TOutput>
{
  protected:
  using StratType = typename depthfirst_multiplier::StrategyType<is_generic, TInput, TWeight, TOutput, TAccum, OutputStage>::Type;
  using WorkspaceManager = Workspace<
    OutputArrayElement<TOutput>,
    depthfirst_multiplier::InputPatchElement<TInput, is_generic, OutputStage>,
    ActivationsElement<TOutput, OutputStage>
  >;
  using WorkingSpace = typename WorkspaceManager::WorkspaceType;

  OutputStage m_os;  // Copy of the output parameters
  const void *m_bias = nullptr;  // Copy of the bias (should we need it)

  public:
  DepthwiseDepthfirstMultiplier(StratType *const strat, const DepthwiseArgs &args, const OutputStage &os = {})
  : DepthfirstDriver<TInput, TWeight, TOutput>(strat, args), m_os(os)
  {
  }

  DepthwiseDepthfirstMultiplier(DepthwiseDepthfirstMultiplier &) = delete;
  DepthwiseDepthfirstMultiplier &operator=(DepthwiseDepthfirstMultiplier &) = delete;

  size_t get_storage_size(void) const override
  {
    return reinterpret_cast<const StratType *>(this->m_strat.get())
      ->get_storage_size(this->m_args);
  }

  void pack_parameters(void *buffer, const void *biases, const void *weights, size_t ld_weight_col, size_t ld_weight_row) override
  {
    reinterpret_cast<const StratType *>(this->m_strat.get())
      ->pack_parameters(this->m_args, buffer, biases, m_os, weights, ld_weight_col, ld_weight_row);
    m_bias = biases;
    depthwise_depthfirst::stash_bias(m_os, biases);
  }

  size_t get_working_size_per_thread(const unsigned int n_input_channels) const override
  {
    DepthwiseArgs args(this->m_args);
    args.input_channels = n_input_channels;
    return WorkspaceManager::get_sizeof_workspace(WorkspaceArgs<IDepthfirstStrategy, OutputStage>(this->m_strat.get(), args, m_os));
  }

  void initialise_working_space(void *buffer, unsigned int n_input_channels) const override
  {
    DepthwiseArgs args(this->m_args);
    args.input_channels = n_input_channels;
    return WorkspaceManager::initialise(buffer, WorkspaceArgs<IDepthfirstStrategy, OutputStage>(this->m_strat.get(), args, m_os));
  }

  void compute_tile_padded(
    unsigned int output_i, unsigned int output_j,
    unsigned int output_channel_start, unsigned int output_channel_end,
    const TensorSpec<const TInput *> &input,
    const TensorSpec<TOutput *> &output,
    const void *parameters,
    void *working_space_raw
  ) const override
  {
    // Get the working space
    auto ws = reinterpret_cast<WorkingSpace *>(working_space_raw);

    const int ii = static_cast<int>(output_i * this->m_args.stride_rows) - this->m_args.padding.top;
    const auto input_pad_top = static_cast<unsigned int>(ii < 0 ? -ii : 0);
    const auto input_i = static_cast<unsigned int>(ii < 0 ? 0 : ii);

    const int ij = static_cast<int>(output_j * this->m_args.stride_cols) - this->m_args.padding.left;
    const auto input_pad_left = static_cast<unsigned int>(ij < 0 ? -ij : 0);
    const auto input_j = static_cast<unsigned int>(ij < 0 ? 0 : ij);

    // Compute the output pointer array. We'll update this array after every
    // invocation of the kernel.
    fill_pointer_array(
      ws->outptr_array, this->m_strat->get_output_rows(), this->m_strat->get_output_cols(),
      output.base + output_i*output.ld_row + output_j*output.ld_col + output_channel_start,
      output.ld_row, output.ld_col,
      ws->output_buffer,
      0, this->m_args.output_rows - output_i,  // Top padding, # valid rows
      0, this->m_args.output_cols - output_j  // Left padding, # valid columns
    );

    // Compute the parameter stride
    DepthwiseArgs single_iter(this->m_args);
    single_iter.input_channels = 1;
    const size_t parameter_stride = reinterpret_cast<const StratType *>(this->m_strat.get())
      ->get_storage_size(single_iter);

    for (; output_channel_start < output_channel_end;
         output_channel_start += this->m_args.channel_multiplier)
    {
      // Compute the input pointer array
      const auto input_channel = output_channel_start / this->m_args.channel_multiplier;

      // Construct the input patch
      depthfirst_multiplier::PrepareInputSample<is_generic>::execute(
        this->m_args, ws, this->m_strat.get(),
        input.base + input_channel + input_i*input.ld_row + input_j*input.ld_col, input.ld_row, input.ld_col,
        input_pad_top, this->m_args.input_rows - input_i,
        input_pad_left, this->m_args.input_cols - input_j
      );

      // Execute the kernel
      depthfirst_multiplier::StrategyType<is_generic, TInput, TWeight, TOutput, TAccum, OutputStage>::execute(
        this->m_args, ws, reinterpret_cast<const StratType *>(this->m_strat.get()), m_os, output_channel_start,
        parameters, m_bias
      );

      // Update the output pointers
      for (unsigned int n = 0; n < this->m_strat->get_output_rows() * this->m_strat->get_output_cols(); n++)
      {
        ws->outptr_array[n] += this->m_args.channel_multiplier;
      }

      // Progress the parameters
      parameters = reinterpret_cast<const char *>(parameters) + parameter_stride;
    }
  }
};

} // namespace depthwise
} // namespace arm_conv