/*
 * Copyright (c) 2018-2019 ARM Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

/*
 * !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
 *
 * NOTE: Header to be included by implementation files only.
 *
 * !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
 */
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
#include "arm_compute/core/NEON/kernels/convolution/common/arm.hpp"
#include "arm_compute/core/NEON/kernels/convolution/depthwise/impl_base.hpp"

#pragma once

using namespace neon_convolution_kernels;

namespace depthwise
{

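// This header provides the float16_t specialisation of DepthwiseConvolution,
// compiled only when the target supports FP16 vector arithmetic (see the
// __ARM_FEATURE_FP16_VECTOR_ARITHMETIC guard above). The constructor simply
// forwards to the Base class; the execute_tile() overloads below do the
// per-tile work with FP16 NEON intrinsics.
//
// Illustrative instantiation only (the concrete tile/kernel/stride shapes are
// selected elsewhere in the library):
//   DepthwiseConvolution<4, 4, 3, 3, 1, 1, float16_t, float16_t, float16_t>
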
template <
  unsigned int OutputTileRows, unsigned int OutputTileCols,
  unsigned int KernelRows, unsigned int KernelCols,
  unsigned int StrideRows, unsigned int StrideCols
>
DepthwiseConvolution<
  OutputTileRows, OutputTileCols,
  KernelRows, KernelCols, StrideRows, StrideCols,
  float16_t, float16_t, float16_t
>::DepthwiseConvolution(
  int n_batches, int n_input_rows, int n_input_cols, int n_channels,
  ActivationFunction activation,
  unsigned int padding_top,
  unsigned int padding_left,
  unsigned int padding_bottom,
  unsigned int padding_right
) : Base(
  n_batches, n_input_rows, n_input_cols, n_channels, activation,
  padding_top, padding_left, padding_bottom, padding_right
)
{
}

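// Tile execution, strided-pointer form: the input and output tiles are
// addressed as a base pointer plus row and column strides (in elements).
//
// weights_biases_ptr points at the packed parameters for the convolution.
// For each channel (or group of eight channels in the vectorised loop below)
// the bias comes first, followed by the KernelRows x KernelCols weights in
// row-major order; the loads below consume the buffer in exactly that order.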
template <
  unsigned int OutputTileRows, unsigned int OutputTileCols,
  unsigned int KernelRows, unsigned int KernelCols,
  unsigned int StrideRows, unsigned int StrideCols
>
template <ActivationFunction Activation>
void DepthwiseConvolution<
  OutputTileRows, OutputTileCols,
  KernelRows, KernelCols, StrideRows, StrideCols,
  float16_t, float16_t, float16_t
>::execute_tile(
  int n_channels,
  const void *weights_biases_ptr,
  const float16_t *input,
  const unsigned int in_row_stride,
  const unsigned int in_col_stride,
  float16_t *output,
  const unsigned int out_row_stride,
  const unsigned int out_col_stride
)
{
  // Instantiate pointers
  const float16_t* __restrict__ inptr_base = input;
  float16_t* __restrict__ outptr_base = output;
  const float16_t* __restrict__ params = static_cast<const float16_t*>(weights_biases_ptr);

  // Perform the depthwise convolution
  int channels_remaining = n_channels;
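  // Main loop: process eight channels per iteration using float16x8_t
  // vectors; inptr_base and outptr_base advance by eight channels each time.
  // The remaining (n_channels % 8) channels are handled by the scalar tail
  // loop further down.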
  for (; channels_remaining >= 8; channels_remaining -= 8)
  {
    // Load input tile
    float16x8_t u[Base::inner_tile_rows][Base::inner_tile_cols];
    for (int i = 0; i < Base::inner_tile_rows; i++)
    {
      const float16_t* const inptr_row = inptr_base + i*in_row_stride;
      for (int j = 0; j < Base::inner_tile_cols; j++)
      {
        u[i][j] = vld1q_f16(inptr_row + j*in_col_stride);
      }
    }
    inptr_base += 8;

    // Load weights tile
    float16x8_t vbias = vld1q_f16(params);
    params += 8;

    float16x8_t w[KernelRows][KernelCols];
    for (unsigned int i = 0; i < KernelRows; i++)
    {
      for (unsigned int j = 0; j < KernelCols; j++)
      {
        w[i][j] = vld1q_f16(params);
        params += 8;
      }
    }

    // Perform the convolution
    float16x8_t v[OutputTileRows][OutputTileCols];
    for (unsigned int out_i = 0; out_i < OutputTileRows; out_i++)
    {
      for (unsigned int out_j = 0; out_j < OutputTileCols; out_j++)
      {
        v[out_i][out_j] = vbias;

        // Base co-ordinate
        const int base_i = out_i * StrideRows;
        const int base_j = out_j * StrideCols;

        // Fill the accumulator
        for (unsigned int in_i = 0; in_i < KernelRows; in_i++)
        {
          const unsigned int i = base_i + in_i;
          for (unsigned int in_j = 0; in_j < KernelCols; in_j++)
          {
            const unsigned int j = base_j + in_j;

            // v[out_i][out_j] += w[in_i][in_j] * u[i][j];
            v[out_i][out_j] = vaddq_f16(v[out_i][out_j], vmulq_f16(w[in_i][in_j], u[i][j]));
          }
        }

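        // Activation is a compile-time template parameter, so the branches
        // below are resolved when the kernel is instantiated.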
        // Apply the activation function
        if (Activation == ActivationFunction::ReLU ||
            Activation == ActivationFunction::ReLU6)
        {
          v[out_i][out_j] = vmaxq_f16(v[out_i][out_j], vdupq_n_f16(0.0f));
        }
        if (Activation == ActivationFunction::ReLU6)
        {
          v[out_i][out_j] = vminq_f16(v[out_i][out_j], vdupq_n_f16(6.0f));
        }
      }
    }

    // Store the output tile
    for (unsigned int i = 0; i < OutputTileRows; i++)
    {
      float16_t* const outptr_row = outptr_base + i*out_row_stride;
      for (unsigned int j = 0; j < OutputTileCols; j++)
      {
        vst1q_f16(outptr_row + j*out_col_stride, v[i][j]);
      }
    }
    outptr_base += 8;
  }
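  // Scalar tail: handle the remaining (n_channels % 8) channels one at a time.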
  for (; channels_remaining; channels_remaining--)
  {
    // Load input tile
    float16_t u[Base::inner_tile_rows][Base::inner_tile_cols];
    for (int i = 0; i < Base::inner_tile_rows; i++)
    {
      const float16_t* const inptr_row = inptr_base + i*in_row_stride;
      for (int j = 0; j < Base::inner_tile_cols; j++)
      {
        u[i][j] = *(inptr_row + j*in_col_stride);
      }
    }
    inptr_base++;

    // Load weights tile
    float16_t bias = *(params++);
    float16_t w[KernelRows][KernelCols];
    for (unsigned int i = 0; i < KernelRows; i++)
    {
      for (unsigned int j = 0; j < KernelCols; j++)
      {
        w[i][j] = *(params++);
      }
    }

    // Perform the convolution
    float16_t v[OutputTileRows][OutputTileCols];
    for (unsigned int out_i = 0; out_i < OutputTileRows; out_i++)
    {
      for (unsigned int out_j = 0; out_j < OutputTileCols; out_j++)
      {
        // Clear the accumulator
        v[out_i][out_j] = bias;

        // Base co-ordinate
        const int base_i = out_i * StrideRows;
        const int base_j = out_j * StrideCols;

        // Fill the accumulator
        for (unsigned int in_i = 0; in_i < KernelRows; in_i++)
        {
          const unsigned int i = base_i + in_i;
          for (unsigned int in_j = 0; in_j < KernelCols; in_j++)
          {
            const unsigned int j = base_j + in_j;
            v[out_i][out_j] += w[in_i][in_j] * u[i][j];
          }
        }

        // Apply the activation function
        if (Activation == ActivationFunction::ReLU ||
            Activation == ActivationFunction::ReLU6)
        {
          v[out_i][out_j] = std::max<float16_t>(0.0f, v[out_i][out_j]);
        }
        if (Activation == ActivationFunction::ReLU6)
        {
          v[out_i][out_j] = std::min<float16_t>(6.0f, v[out_i][out_j]);
        }
      }
    }

    // Store the output tile
    for (unsigned int i = 0; i < OutputTileRows; i++)
    {
      float16_t* const outptr_row = outptr_base + i*out_row_stride;
      for (unsigned int j = 0; j < OutputTileCols; j++)
      {
        *(outptr_row + j*out_col_stride) = v[i][j];
      }
    }
    outptr_base++;
  }
}

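// Tile execution, pre-computed-pointer form: instead of a base pointer plus
// strides, the caller supplies one pointer per inner-tile input element and
// per output-tile element, and n tracks the current channel offset into every
// pointer. The structure otherwise mirrors the overload above: a vectorised
// loop over groups of eight channels followed by a scalar tail.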
template <
  unsigned int OutputTileRows, unsigned int OutputTileCols,
  unsigned int KernelRows, unsigned int KernelCols,
  unsigned int StrideRows, unsigned int StrideCols
>
template <ActivationFunction Activation>
void DepthwiseConvolution<
  OutputTileRows, OutputTileCols,
  KernelRows, KernelCols, StrideRows, StrideCols,
  float16_t, float16_t, float16_t
>::execute_tile(
  int n_channels,
  const void *weights_biases_ptr,
  const float16_t * inptrs[Base::inner_tile_rows][Base::inner_tile_cols],
  float16_t *outptrs[Base::output_tile_rows][Base::output_tile_cols]
)
{
  // Instantiate pointers
  const float16_t* __restrict__ params = static_cast<const float16_t*>(weights_biases_ptr);
  int n = 0;

  // Perform the depthwise convolution
  int channels_remaining = n_channels;
  for (; channels_remaining >= 8; channels_remaining -= 8, n += 8)
  {
    // Load input tile
    float16x8_t u[Base::inner_tile_rows][Base::inner_tile_cols];
    for (int i = 0; i < Base::inner_tile_rows; i++)
    {
      for (int j = 0; j < Base::inner_tile_cols; j++)
      {
        u[i][j] = vld1q_f16(inptrs[i][j] + n);
      }
    }

    // Load weights tile
    float16x8_t vbias = vld1q_f16(params);
    params += 8;

    float16x8_t w[KernelRows][KernelCols];
    for (unsigned int i = 0; i < KernelRows; i++)
    {
      for (unsigned int j = 0; j < KernelCols; j++)
      {
        w[i][j] = vld1q_f16(params);
        params += 8;
      }
    }

    // Perform the convolution
    float16x8_t v[OutputTileRows][OutputTileCols];
    for (unsigned int out_i = 0; out_i < OutputTileRows; out_i++)
    {
      for (unsigned int out_j = 0; out_j < OutputTileCols; out_j++)
      {
        v[out_i][out_j] = vbias;

        // Base co-ordinate
        const int base_i = out_i * StrideRows;
        const int base_j = out_j * StrideCols;

        // Fill the accumulator
        for (unsigned int in_i = 0; in_i < KernelRows; in_i++)
        {
          const unsigned int i = base_i + in_i;
          for (unsigned int in_j = 0; in_j < KernelCols; in_j++)
          {
            const unsigned int j = base_j + in_j;

            // v[out_i][out_j] += w[in_i][in_j] * u[i][j];
            v[out_i][out_j] = vaddq_f16(v[out_i][out_j], vmulq_f16(w[in_i][in_j], u[i][j]));
          }
        }

        // Apply the activation function
        if (Activation == ActivationFunction::ReLU ||
            Activation == ActivationFunction::ReLU6)
        {
          v[out_i][out_j] = vmaxq_f16(v[out_i][out_j], vdupq_n_f16(0.0f));
        }
        if (Activation == ActivationFunction::ReLU6)
        {
          v[out_i][out_j] = vminq_f16(v[out_i][out_j], vdupq_n_f16(6.0f));
        }
      }
    }

    // Store the output tile
    for (unsigned int i = 0; i < OutputTileRows; i++)
    {
      for (unsigned int j = 0; j < OutputTileCols; j++)
      {
        vst1q_f16(outptrs[i][j] + n, v[i][j]);
      }
    }
  }
  for (; channels_remaining; channels_remaining--, n++)
  {
    // Load input tile
    float16_t u[Base::inner_tile_rows][Base::inner_tile_cols];
    for (int i = 0; i < Base::inner_tile_rows; i++)
    {
      for (int j = 0; j < Base::inner_tile_cols; j++)
      {
        u[i][j] = *(inptrs[i][j] + n);
      }
    }

    // Load weights tile
    float16_t bias = *(params++);
    float16_t w[KernelRows][KernelCols];
    for (unsigned int i = 0; i < KernelRows; i++)
    {
      for (unsigned int j = 0; j < KernelCols; j++)
      {
        w[i][j] = *(params++);
      }
    }

    // Perform the convolution
    float16_t v[OutputTileRows][OutputTileCols];
    for (unsigned int out_i = 0; out_i < OutputTileRows; out_i++)
    {
      for (unsigned int out_j = 0; out_j < OutputTileCols; out_j++)
      {
        // Clear the accumulator
        v[out_i][out_j] = bias;

        // Base co-ordinate
        const int base_i = out_i * StrideRows;
        const int base_j = out_j * StrideCols;

        // Fill the accumulator
        for (unsigned int in_i = 0; in_i < KernelRows; in_i++)
        {
          const unsigned int i = base_i + in_i;
          for (unsigned int in_j = 0; in_j < KernelCols; in_j++)
          {
            const unsigned int j = base_j + in_j;
            v[out_i][out_j] += w[in_i][in_j] * u[i][j];
          }
        }

        // Apply the activation function
        if (Activation == ActivationFunction::ReLU ||
            Activation == ActivationFunction::ReLU6)
        {
          v[out_i][out_j] = std::max<float16_t>(0.0f, v[out_i][out_j]);
        }
        if (Activation == ActivationFunction::ReLU6)
        {
          v[out_i][out_j] = std::min<float16_t>(6.0f, v[out_i][out_j]);
        }
      }
    }

    // Store the output tile
    for (unsigned int i = 0; i < OutputTileRows; i++)
    {
      for (unsigned int j = 0; j < OutputTileCols; j++)
      {
        *(outptrs[i][j] + n) = v[i][j];
      }
    }
  }
}

} // namespace depthwise
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC