/*
 * Copyright (c) 2018-2019 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

/*
 * !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
 *
 * NOTE: Header to be included by implementation files only.
 *
 * !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
 */
Georgios Pinitas30271c72019-06-24 14:56:34 +010033#include "arm.hpp"
34#include "impl_base.hpp"
Georgios Pinitas4074c992018-01-30 18:13:46 +000035
36#pragma once
37
Georgios Pinitas47d39dc2019-03-11 14:03:23 +000038using namespace neon_convolution_kernels;
39
Georgios Pinitas4074c992018-01-30 18:13:46 +000040namespace depthwise
41{
Georgios Pinitas4074c992018-01-30 18:13:46 +000042
Georgios Pinitas4074c992018-01-30 18:13:46 +000043template <
Georgios Pinitas47d39dc2019-03-11 14:03:23 +000044 unsigned int OutputTileRows, unsigned int OutputTileCols,
45 unsigned int KernelRows, unsigned int KernelCols,
46 unsigned int StrideRows, unsigned int StrideCols
Georgios Pinitas4074c992018-01-30 18:13:46 +000047>
Georgios Pinitas47d39dc2019-03-11 14:03:23 +000048DepthwiseConvolution<
49 OutputTileRows, OutputTileCols,
50 KernelRows, KernelCols, StrideRows, StrideCols,
51 float, float, float
52>::DepthwiseConvolution(
53 int n_batches, int n_input_rows, int n_input_cols, int n_channels,
54 ActivationFunction activation,
55 unsigned int padding_top,
56 unsigned int padding_left,
57 unsigned int padding_bottom,
58 unsigned int padding_right
59) : Base(
60 n_batches, n_input_rows, n_input_cols, n_channels, activation,
61 padding_top, padding_left, padding_bottom, padding_right
62 )
63{
64}
65
Georgios Pinitas30271c72019-06-24 14:56:34 +010066template <
67 unsigned int OutputTileRows, unsigned int OutputTileCols,
68 unsigned int KernelRows, unsigned int KernelCols,
69 unsigned int StrideRows, unsigned int StrideCols
70>
71DepthwiseConvolution<
72 OutputTileRows, OutputTileCols,
73 KernelRows, KernelCols, StrideRows, StrideCols,
74 float, float, float
75>::DepthwiseConvolution(
76 int n_batches, int n_input_rows, int n_input_cols, int n_channels,
77 int n_output_rows, int n_output_cols,
78 ActivationFunction activation,
79 unsigned int padding_top,
80 unsigned int padding_left,
81 unsigned int padding_bottom,
82 unsigned int padding_right
83) : Base(
84 n_batches, n_input_rows, n_input_cols, n_channels,
85 n_output_rows, n_output_cols, activation,
86 padding_top, padding_left, padding_bottom, padding_right
87 )
88{
89}
Georgios Pinitas47d39dc2019-03-11 14:03:23 +000090
/*
 * Compute one output tile of the FP32 depthwise convolution, addressing the
 * input/output through a base pointer plus row/column strides.
 *
 * Channels are processed four at a time with NEON vectors, then a scalar
 * loop handles the remaining (n_channels % 4) channels.  The data is laid
 * out channel-innermost: each vector load reads four consecutive floats
 * (four channels) and the base pointers advance by 4 (vector loop) or 1
 * (scalar loop) per iteration, while rows/columns are reached via the
 * strides (which are in elements, not bytes, since arithmetic is on float*).
 *
 * The parameter blob at `weights_biases_ptr` is consumed strictly
 * sequentially: for each group of four channels, 4 bias values followed by
 * KernelRows*KernelCols groups of 4 weights; for each tail channel, 1 bias
 * followed by KernelRows*KernelCols weights.  Do not reorder the loads —
 * `params` is bumped as it is read.
 *
 * `Activation` is a compile-time template parameter; the ReLU/ReLU6
 * clamping below branches only on that constant.
 */
template <
  unsigned int OutputTileRows, unsigned int OutputTileCols,
  unsigned int KernelRows, unsigned int KernelCols,
  unsigned int StrideRows, unsigned int StrideCols
>
template <ActivationFunction Activation>
void DepthwiseConvolution<
  OutputTileRows, OutputTileCols,
  KernelRows, KernelCols, StrideRows, StrideCols,
  float, float, float
>::execute_tile(
  int n_channels,
  const void *weights_biases_ptr,
  const float *input,
  const unsigned int in_row_stride,
  const unsigned int in_col_stride,
  float *output,
  const unsigned int out_row_stride,
  const unsigned int out_col_stride
)
{
  // Instantiate pointers.  __restrict__ promises no aliasing between the
  // input, output and parameter streams, enabling better code generation.
  const float* __restrict__ inptr_base = input;
  float* __restrict__ outptr_base = output;
  const float* __restrict__ params = static_cast<const float*>(weights_biases_ptr);

  // Perform the depthwise convolution, four channels per iteration.
  int channels_remaining = n_channels;
  for (; channels_remaining >= 4; channels_remaining -= 4)
  {
    // Load the full input tile for these four channels.
    float32x4_t u[Base::inner_tile_rows][Base::inner_tile_cols];
    for (int i = 0; i < Base::inner_tile_rows; i++)
    {
      const float* const inptr_row = inptr_base + i*in_row_stride;
      for (int j = 0; j < Base::inner_tile_cols; j++)
      {
        u[i][j] = vld1q_f32(inptr_row + j*in_col_stride);
      }
    }
    inptr_base += 4;  // advance to the next group of four channels

    // Load weights tile: 4 biases first, then the 4-wide kernel weights.
    float32x4_t vbias = vld1q_f32(params);
    params += 4;

    float32x4_t w[KernelRows][KernelCols];
    for (unsigned int i = 0; i < KernelRows; i++)
    {
      for (unsigned int j = 0; j < KernelCols; j++)
      {
        w[i][j] = vld1q_f32(params);
        params += 4;
      }
    }

    // Perform the convolution for every position in the output tile.
    float32x4_t v[OutputTileRows][OutputTileCols];
    for (unsigned int out_i = 0; out_i < OutputTileRows; out_i++)
    {
      for (unsigned int out_j = 0; out_j < OutputTileCols; out_j++)
      {
        // Start the accumulator from the bias.
        v[out_i][out_j] = vbias;

        // Base co-ordinate of this output position within the input tile.
        const int base_i = out_i * StrideRows;
        const int base_j = out_j * StrideCols;

        // Fill the accumulator: multiply-accumulate over the kernel window.
        for (unsigned int in_i = 0; in_i < KernelRows; in_i++)
        {
          const unsigned int i = base_i + in_i;
          for (unsigned int in_j = 0; in_j < KernelCols; in_j++)
          {
            const unsigned int j = base_j + in_j;

            // v[out_i][out_j] += w[in_i][in_j] * u[i][j];
            v[out_i][out_j] = vmlaq_f32(v[out_i][out_j], w[in_i][in_j], u[i][j]);
          }
        }

        // Apply the activation function (compile-time selected).
        if (Activation == ActivationFunction::ReLU ||
            Activation == ActivationFunction::ReLU6)
        {
          v[out_i][out_j] = vmaxq_f32(v[out_i][out_j], vdupq_n_f32(0.0f));
        }
        if (Activation == ActivationFunction::ReLU6)
        {
          v[out_i][out_j] = vminq_f32(v[out_i][out_j], vdupq_n_f32(6.0f));
        }
      }
    }

    // Store the output tile for these four channels.
    for (unsigned int i = 0; i < OutputTileRows; i++)
    {
      float* const outptr_row = outptr_base + i*out_row_stride;
      for (unsigned int j = 0; j < OutputTileCols; j++)
      {
        vst1q_f32(outptr_row + j*out_col_stride, v[i][j]);
      }
    }
    outptr_base += 4;
  }
  // Scalar tail: same algorithm, one channel at a time.
  for (; channels_remaining; channels_remaining--)
  {
    // Load input tile
    float u[Base::inner_tile_rows][Base::inner_tile_cols];
    for (int i = 0; i < Base::inner_tile_rows; i++)
    {
      const float* const inptr_row = inptr_base + i*in_row_stride;
      for (int j = 0; j < Base::inner_tile_cols; j++)
      {
        u[i][j] = *(inptr_row + j*in_col_stride);
      }
    }
    inptr_base++;

    // Load weights tile: one bias, then KernelRows*KernelCols weights.
    float bias = *(params++);
    float w[KernelRows][KernelCols];
    for (unsigned int i = 0; i < KernelRows; i++)
    {
      for (unsigned int j = 0; j < KernelCols; j++)
      {
        w[i][j] = *(params++);
      }
    }

    // Perform the convolution
    float v[OutputTileRows][OutputTileCols];
    for (unsigned int out_i = 0; out_i < OutputTileRows; out_i++)
    {
      for (unsigned int out_j = 0; out_j < OutputTileCols; out_j++)
      {
        // Clear the accumulator (initialise with the bias).
        v[out_i][out_j] = bias;

        // Base co-ordinate
        const int base_i = out_i * StrideRows;
        const int base_j = out_j * StrideCols;

        // Fill the accumulator
        for (unsigned int in_i = 0; in_i < KernelRows; in_i++)
        {
          const unsigned int i = base_i + in_i;
          for (unsigned int in_j = 0; in_j < KernelCols; in_j++)
          {
            const int j = base_j + in_j;
            v[out_i][out_j] += w[in_i][in_j] * u[i][j];
          }
        }

        // Apply the activation function
        if (Activation == ActivationFunction::ReLU ||
            Activation == ActivationFunction::ReLU6)
        {
          v[out_i][out_j] = std::max(0.0f, v[out_i][out_j]);
        }
        if (Activation == ActivationFunction::ReLU6)
        {
          v[out_i][out_j] = std::min(6.0f, v[out_i][out_j]);
        }
      }
    }

    // Store the output tile
    for (unsigned int i = 0; i < OutputTileRows; i++)
    {
      float* const outptr_row = outptr_base + i*out_row_stride;
      for (unsigned int j = 0; j < OutputTileCols; j++)
      {
        *(outptr_row + j*out_col_stride) = v[i][j];
      }
    }
    outptr_base++;
  }
}
270
Georgios Pinitasa4bba9c2019-04-02 15:27:52 +0100271
/*
 * Compute one output tile of the FP32 depthwise convolution, addressing the
 * input/output through per-position pointer arrays instead of base+stride:
 * inptrs[i][j] points at the channel-0 element of input position (i, j),
 * and outptrs[i][j] likewise for the output tile.  The running channel
 * offset `n` is simply added to every pointer, so the data must be
 * channel-innermost (consecutive floats are consecutive channels).
 *
 * As in the strided overload, channels are processed four at a time with
 * NEON vectors and a scalar loop handles the (n_channels % 4) tail.  The
 * parameter blob is consumed strictly sequentially: per four-channel group,
 * 4 biases then KernelRows*KernelCols groups of 4 weights; per tail
 * channel, 1 bias then KernelRows*KernelCols weights.
 *
 * `Activation` is a compile-time template parameter; the ReLU/ReLU6
 * clamping below branches only on that constant.
 */
template <
  unsigned int OutputTileRows, unsigned int OutputTileCols,
  unsigned int KernelRows, unsigned int KernelCols,
  unsigned int StrideRows, unsigned int StrideCols
>
template <ActivationFunction Activation>
void DepthwiseConvolution<
  OutputTileRows, OutputTileCols,
  KernelRows, KernelCols, StrideRows, StrideCols,
  float, float, float
>::execute_tile(
  int n_channels,
  const void *weights_biases_ptr,
  const float *inptrs[Base::inner_tile_rows][Base::inner_tile_cols],
  float *outptrs[Base::output_tile_rows][Base::output_tile_cols]
)
{
  // Weights/biases are read strictly in order; `params` is bumped as it goes.
  const float* __restrict__ params = static_cast<const float*>(weights_biases_ptr);

  // Perform the depthwise convolution, four channels per iteration.
  // `n` tracks the current channel offset applied to every tile pointer.
  int channels_remaining = n_channels;
  int n = 0;
  for (; channels_remaining >= 4; channels_remaining -= 4, n += 4)
  {
    // Load the full input tile for these four channels.
    float32x4_t u[Base::inner_tile_rows][Base::inner_tile_cols];
    for (int i = 0; i < Base::inner_tile_rows; i++)
    {
      for (int j = 0; j < Base::inner_tile_cols; j++)
      {
        u[i][j] = vld1q_f32(inptrs[i][j] + n);
      }
    }

    // Load weights tile: 4 biases first, then the 4-wide kernel weights.
    float32x4_t vbias = vld1q_f32(params);
    params += 4;

    float32x4_t w[KernelRows][KernelCols];
    for (unsigned int i = 0; i < KernelRows; i++)
    {
      for (unsigned int j = 0; j < KernelCols; j++)
      {
        w[i][j] = vld1q_f32(params);
        params += 4;
      }
    }

    // Perform the convolution for every position in the output tile.
    float32x4_t v[OutputTileRows][OutputTileCols];
    for (unsigned int out_i = 0; out_i < OutputTileRows; out_i++)
    {
      for (unsigned int out_j = 0; out_j < OutputTileCols; out_j++)
      {
        // Start the accumulator from the bias.
        v[out_i][out_j] = vbias;

        // Base co-ordinate of this output position within the input tile.
        const int base_i = out_i * StrideRows;
        const int base_j = out_j * StrideCols;

        // Fill the accumulator: multiply-accumulate over the kernel window.
        for (unsigned int in_i = 0; in_i < KernelRows; in_i++)
        {
          const unsigned int i = base_i + in_i;
          for (unsigned int in_j = 0; in_j < KernelCols; in_j++)
          {
            const unsigned int j = base_j + in_j;

            // v[out_i][out_j] += w[in_i][in_j] * u[i][j];
            v[out_i][out_j] = vmlaq_f32(v[out_i][out_j], w[in_i][in_j], u[i][j]);
          }
        }

        // Apply the activation function (compile-time selected).
        if (Activation == ActivationFunction::ReLU ||
            Activation == ActivationFunction::ReLU6)
        {
          v[out_i][out_j] = vmaxq_f32(v[out_i][out_j], vdupq_n_f32(0.0f));
        }
        if (Activation == ActivationFunction::ReLU6)
        {
          v[out_i][out_j] = vminq_f32(v[out_i][out_j], vdupq_n_f32(6.0f));
        }
      }
    }

    // Store the output tile for these four channels.
    for (unsigned int i = 0; i < OutputTileRows; i++)
    {
      for (unsigned int j = 0; j < OutputTileCols; j++)
      {
        vst1q_f32(outptrs[i][j] + n, v[i][j]);
      }
    }
  }
  // Scalar tail: same algorithm, one channel at a time.
  for (; channels_remaining; channels_remaining--, n++)
  {
    // Load input tile
    float u[Base::inner_tile_rows][Base::inner_tile_cols];
    for (int i = 0; i < Base::inner_tile_rows; i++)
    {
      for (int j = 0; j < Base::inner_tile_cols; j++)
      {
        u[i][j] = *(inptrs[i][j] + n);
      }
    }

    // Load weights tile: one bias, then KernelRows*KernelCols weights.
    float bias = *(params++);
    float w[KernelRows][KernelCols];
    for (unsigned int i = 0; i < KernelRows; i++)
    {
      for (unsigned int j = 0; j < KernelCols; j++)
      {
        w[i][j] = *(params++);
      }
    }

    // Perform the convolution
    float v[OutputTileRows][OutputTileCols];
    for (unsigned int out_i = 0; out_i < OutputTileRows; out_i++)
    {
      for (unsigned int out_j = 0; out_j < OutputTileCols; out_j++)
      {
        // Clear the accumulator (initialise with the bias).
        v[out_i][out_j] = bias;

        // Base co-ordinate
        const int base_i = out_i * StrideRows;
        const int base_j = out_j * StrideCols;

        // Fill the accumulator
        for (unsigned int in_i = 0; in_i < KernelRows; in_i++)
        {
          const unsigned int i = base_i + in_i;
          for (unsigned int in_j = 0; in_j < KernelCols; in_j++)
          {
            const int j = base_j + in_j;
            v[out_i][out_j] += w[in_i][in_j] * u[i][j];
          }
        }

        // Apply the activation function
        if (Activation == ActivationFunction::ReLU ||
            Activation == ActivationFunction::ReLU6)
        {
          v[out_i][out_j] = std::max(0.0f, v[out_i][out_j]);
        }
        if (Activation == ActivationFunction::ReLU6)
        {
          v[out_i][out_j] = std::min(6.0f, v[out_i][out_j]);
        }
      }
    }

    // Store the output tile
    for (unsigned int i = 0; i < OutputTileRows; i++)
    {
      for (unsigned int j = 0; j < OutputTileCols; j++)
      {
        *(outptrs[i][j] + n) = v[i][j];
      }
    }
  }
}
437
Georgios Pinitas4074c992018-01-30 18:13:46 +0000438} // namespace depthwise