/*
 * Copyright (c) 2022-2023 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#pragma once

#include "depthwise.hpp"
#include "utils.hpp"

#include <cstddef>
#include <cstdint>
#include <memory>

namespace arm_conv {
namespace depthwise {

template <typename T> struct DefaultTAccum { using Type = T; };
template <> struct DefaultTAccum<int8_t> { using Type = int32_t; };
template <> struct DefaultTAccum<uint8_t> { using Type = int32_t; };

template <typename T> struct DefaultOutputStage { using Type = Nothing; };
template <> struct DefaultOutputStage<int8_t> { using Type = arm_gemm::Requantize32; };
template <> struct DefaultOutputStage<uint8_t> { using Type = arm_gemm::Requantize32; };
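
// Illustrative sketch (not part of this header): kernel templates can use
// these traits to pick defaults, so that quantized types accumulate into
// int32_t and carry a Requantize32 output stage while other types accumulate
// in-type with no output stage. The class name below is a hypothetical
// example, not a type defined by the library.
//
//   template <typename TInput,
//             typename TAccum = typename DefaultTAccum<TInput>::Type,
//             typename OutputStage = typename DefaultOutputStage<TInput>::Type>
//   class ExampleDepthwiseKernel;  // int8_t -> (int32_t, Requantize32),
//                                  // float  -> (float, Nothing)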

class IDepthfirstStrategy
{
  public:
  virtual ~IDepthfirstStrategy() = default;

  virtual unsigned int get_input_rows() const = 0;
  virtual unsigned int get_input_cols() const = 0;

  virtual unsigned int get_output_rows() const = 0;
  virtual unsigned int get_output_cols() const = 0;
};

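// Illustrative sketch (not part of the library): a strategy for a
// hypothetical 3x3, stride-1 kernel producing 2x2 output tiles would report
// a 4x4 input window per tile:
//
//   class Example3x3Stride1Strategy : public IDepthfirstStrategy
//   {
//     public:
//     unsigned int get_input_rows() const override { return 4; }
//     unsigned int get_input_cols() const override { return 4; }
//     unsigned int get_output_rows() const override { return 2; }
//     unsigned int get_output_cols() const override { return 2; }
//   };
//
// The driver below only queries these tile dimensions to plan its iteration;
// the tile arithmetic itself lives in the concrete driver implementations.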

template <typename T>
struct TensorSpec
{
  T base;
  size_t ld_row, ld_col;

  TensorSpec(T ptr, size_t ld_row, size_t ld_col)
  : base(ptr), ld_row(ld_row), ld_col(ld_col) {}
};

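// Illustrative note (an assumption based on how the strides are used below):
// for a tensor described by a TensorSpec `t`, the element at spatial position
// (i, j) is expected at `t.base + i * t.ld_row + j * t.ld_col`, with channels
// taken to be contiguous. For example:
//
//   // Hypothetical NHWC image, 224 columns, 3 channels, channels contiguous.
//   const float *image_data = get_image();  // assumed helper
//   TensorSpec<const float *> image(image_data, 224 * 3, 3);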

template <typename TInput, typename TWeight, typename TOutput>
class DepthfirstDriver : public DepthwiseCommon<TInput, TWeight, TOutput>
{
  protected:
  using Parent = DepthwiseCommon<TInput, TWeight, TOutput>;

  // The strategy which we're applying to solve the depthwise convolution.
  std::unique_ptr<const IDepthfirstStrategy> m_strat;

  /* Compute the amount of working space required for a single thread. */
  virtual size_t get_working_size_per_thread() const = 0;

  /* Initialise the working space for a thread. */
  virtual void initialise_working_space(void *) const = 0;

  /* Compute a portion of the output tensor with padding. */
  virtual void compute_tile_padded(
    const DepthwiseArgs &args,
    unsigned int output_i, unsigned int output_j,
    unsigned int output_channel_start, unsigned int output_channel_end,
    const TensorSpec<const TInput *> &input,
    const TensorSpec<TOutput *> &output,
    const void *parameters,
    void *working_space
  ) const = 0;

  /* Compute a portion of the work with only top/bottom padding.
   *
   * The default implementation of this repeatedly calls into the padded tile
   * variant.
   */
  virtual void compute_row_padded_tile_row(
    const DepthwiseArgs &args,
    const unsigned int output_i, unsigned int output_j, unsigned int n_tile_cols,
    const unsigned int output_channel_start, const unsigned int output_channel_end,
    const TensorSpec<const TInput *> &input,
    const TensorSpec<TOutput *> &output,
    const void *parameters,
    void *working_space
  ) const
  {
    for (; n_tile_cols; n_tile_cols--, output_j += m_strat->get_output_cols())
    {
      this->compute_tile_padded(
        args,
        output_i, output_j, output_channel_start, output_channel_end,
        input, output, parameters, working_space
      );
    }
  }

  /* Compute a portion of the output tensor with no padding.
   *
   * The default implementation of this repeatedly calls into the padded
   * variant.
   */
  virtual void compute_tiles_unpadded(
    const DepthwiseArgs &args,
    unsigned int start_output_i, unsigned int start_output_j,
    unsigned int n_tile_rows, unsigned int n_tile_cols,
    unsigned int output_channel_start, unsigned int output_channel_end,
    const TensorSpec<const TInput *> &input,
    const TensorSpec<TOutput *> &output,
    const void *parameters,
    void *working_space
  ) const
  {
    for (unsigned int tile_i = 0; tile_i < n_tile_rows; tile_i++)
    {
      unsigned int row_start_output_j = start_output_j;
      for (unsigned int tile_j = 0; tile_j < n_tile_cols; tile_j++)
      {
        this->compute_tile_padded(
          args,
          start_output_i, row_start_output_j,
          output_channel_start, output_channel_end,
          input, output, parameters, working_space
        );
        row_start_output_j += m_strat->get_output_cols();
      }
      start_output_i += m_strat->get_output_rows();
    }
  }

  void execute_internal(
    const DepthwiseArgs &args,
    const void *input,
    size_t ld_input_col,
    size_t ld_input_row,
    size_t ld_input_batch,
    const void *parameters,
    void *output,
    size_t ld_output_col,
    size_t ld_output_row,
    size_t ld_output_batch,
    void *working_space,
    unsigned int thread_id,
    unsigned int n_threads
  ) const override
  {
    // Get and initialise the working space for this thread.
    void *thread_working_space =
      static_cast<uint8_t *>(working_space) + thread_id * this->get_working_size_per_thread();
    this->initialise_working_space(thread_working_space);

    // Construct convenient representations of the input/output tensors.
    TensorSpec<const TInput *> input_tensor(reinterpret_cast<const TInput *>(input), ld_input_row, ld_input_col);
    TensorSpec<TOutput *> output_tensor(reinterpret_cast<TOutput *>(output), ld_output_row, ld_output_col);

    const auto n_output_channels = args.input_channels * args.channel_multiplier;

    for (unsigned int batch = 0; batch < args.n_batches; batch++)
    {
      // Iterate over rows of the output tensor; we stripe over the tiles.
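      // Illustrative example (assumed figures): with two threads and a
      // strategy producing two output rows per tile, thread 0 handles output
      // rows 0-1, 4-5, 8-9, ... while thread 1 handles rows 2-3, 6-7, 10-11, ...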
      for (unsigned int start_output_i = thread_id * m_strat->get_output_rows();
           start_output_i < args.output_rows;
           start_output_i += n_threads * m_strat->get_output_rows())
      {
        // Determine what padding (if any) is required on the top/bottom of
        // this row of the convolution.
        const auto end_output_i = start_output_i + m_strat->get_output_rows();
        const bool pad_output_bottom = args.output_rows < end_output_i;

        const int start_input_i = start_output_i * args.stride_rows - args.padding.top;
        const bool pad_input_top = start_input_i < 0;
        const int end_input_i = start_input_i + m_strat->get_input_rows();
        const bool pad_input_bottom = static_cast<int>(args.input_rows) < end_input_i;
        // We only need to account for input padding if direct padding is not supported.
        const bool pad_row = ((pad_input_top || pad_input_bottom) && !this->supports_direct_padding())
                             || pad_output_bottom;
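        // Illustrative example (assumed figures): with stride_rows == 1 and
        // padding.top == 1, the first row of tiles has start_input_i == -1,
        // so pad_input_top is true and pad_row selects the padded path unless
        // the implementation reports supports_direct_padding().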

        // Iterate over the columns of the output tensor; we attempt to grab as
        // much as possible of the unpadded regions, so the loop structure is a
        // bit odd.
        unsigned int start_output_j = 0;
        while (start_output_j < args.output_cols)
        {
          const int start_in_j = start_output_j * args.stride_cols - args.padding.left;
          const bool pad_input_left = start_in_j < 0;

          // Determine if we can process a number of unpadded tiles in one go.
          int n_unpadded_tiles = 0;
          if ((!pad_input_left) || this->supports_direct_padding())
          {
            // Determine the maximum number of tiles we could handle.
            n_unpadded_tiles = (args.output_cols - start_output_j) / m_strat->get_output_cols();

            // Handle padding on the right hand edge
            const int tile_stride = m_strat->get_output_cols() * args.stride_cols;
            int end_output_j = start_output_j + n_unpadded_tiles * m_strat->get_output_cols();
            int end_input_j = start_in_j + m_strat->get_input_cols() + (n_unpadded_tiles - 1)*tile_stride;

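            // Illustrative example (assumed figures): for a kernel reading 4
            // input columns to produce 2 output columns at stride 1
            // (tile_stride == 2), with input_cols == 9, output_cols == 8 and
            // start_in_j == 0, four tiles are proposed but the fourth would
            // need end_input_j == 10 > 9, so the loop below drops it and the
            // remaining columns fall through to the padded path.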
            while (n_unpadded_tiles > 0 &&
                   (static_cast<int>(args.output_cols) < end_output_j ||
                    static_cast<int>(args.input_cols) < end_input_j))
            {
              n_unpadded_tiles--;
              end_output_j -= m_strat->get_output_cols();
              end_input_j -= tile_stride;
            }
          }

          // Process unpadded tiles, if possible, otherwise process a padded tile.
          if (n_unpadded_tiles)
          {
            if (!pad_row)
            {
              // Completely unpadded execution
              this->compute_tiles_unpadded(
                args,
                start_output_i, start_output_j,
                1, n_unpadded_tiles, // Compute a row of unpadded tiles
                0, n_output_channels, // Compute all channels
                input_tensor, output_tensor, parameters, thread_working_space
              );
            }
            else
            {
              // Top/bottom padding only
              this->compute_row_padded_tile_row(
                args,
                start_output_i, start_output_j, n_unpadded_tiles,
                0, n_output_channels, // Compute all channels
                input_tensor, output_tensor, parameters, thread_working_space
              );
            }
            start_output_j += n_unpadded_tiles * m_strat->get_output_cols();
          }
          else
          {
            this->compute_tile_padded(
              args,
              start_output_i, start_output_j,
              0, n_output_channels, // Compute all channels
              input_tensor, output_tensor, parameters, thread_working_space
            );
            start_output_j += m_strat->get_output_cols();
          }
        }
      }

      // Progress the pointers for the next batch.
      input_tensor.base += ld_input_batch;
      output_tensor.base += ld_output_batch;
    }
  }

  public:
  DepthfirstDriver(IDepthfirstStrategy *strategy, const DepthwiseArgs &args)
  : Parent(args), m_strat(strategy)
  {
  }

  size_t get_working_size(unsigned int n_threads) const override final
  {
    return n_threads * this->get_working_size_per_thread();
  }

  virtual bool supports_direct_padding() const
  {
    return false;
  }
};
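
// Illustrative sketch (not part of the library): a concrete driver supplies
// only the pure-virtual pieces; threading, batching and padding
// classification are handled by DepthfirstDriver above. All names here are
// hypothetical.
//
//   template <typename T>
//   class ExampleDriver : public DepthfirstDriver<T, T, T>
//   {
//     protected:
//     size_t get_working_size_per_thread() const override { return 0; }
//     void initialise_working_space(void *) const override {}
//
//     void compute_tile_padded(
//       const DepthwiseArgs &args,
//       unsigned int output_i, unsigned int output_j,
//       unsigned int output_channel_start, unsigned int output_channel_end,
//       const TensorSpec<const T *> &input,
//       const TensorSpec<T *> &output,
//       const void *parameters,
//       void *working_space
//     ) const override
//     {
//       // Gather the (possibly padded) input window for this tile and apply
//       // the kernel described by `parameters` to the requested channels.
//     }
//
//     public:
//     using DepthfirstDriver<T, T, T>::DepthfirstDriver;
//   };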

} // namespace depthwise
} // namespace arm_conv