/*
 * Copyright (c) 2018-2019 ARM Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

/*
 * !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
 *
 * NOTE: Header to be included by implementation files only.
 *
 * !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
 */
32#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
Georgios Pinitas30271c72019-06-24 14:56:34 +010033#include "arm.hpp"
34#include "impl_base.hpp"
Georgios Pinitas20c246a2018-09-12 16:45:53 +010035
36#pragma once
37
Georgios Pinitas47d39dc2019-03-11 14:03:23 +000038using namespace neon_convolution_kernels;
39
Georgios Pinitas20c246a2018-09-12 16:45:53 +010040namespace depthwise
41{
Georgios Pinitas20c246a2018-09-12 16:45:53 +010042
/** Construct an fp16 depthwise convolution, deriving the output size implicitly.
 *
 * Delegates straight to the generic Base constructor, which receives the
 * input geometry, activation and padding; no fp16-specific state is set here.
 *
 * @param n_batches      Number of batches in the input tensor.
 * @param n_input_rows   Number of rows in the input tensor.
 * @param n_input_cols   Number of columns in the input tensor.
 * @param n_channels     Number of channels (depthwise: one filter per channel).
 * @param activation     Activation function to fuse into the convolution.
 * @param padding_top    Rows of implicit zero padding above the input.
 * @param padding_left   Columns of implicit zero padding left of the input.
 * @param padding_bottom Rows of implicit zero padding below the input.
 * @param padding_right  Columns of implicit zero padding right of the input.
 */
template <
  unsigned int OutputTileRows, unsigned int OutputTileCols,
  unsigned int KernelRows, unsigned int KernelCols,
  unsigned int StrideRows, unsigned int StrideCols
>
DepthwiseConvolution<
  OutputTileRows, OutputTileCols,
  KernelRows, KernelCols, StrideRows, StrideCols,
  float16_t, float16_t, float16_t
>::DepthwiseConvolution(
  int n_batches, int n_input_rows, int n_input_cols, int n_channels,
  ActivationFunction activation,
  unsigned int padding_top,
  unsigned int padding_left,
  unsigned int padding_bottom,
  unsigned int padding_right
) : Base(
    n_batches, n_input_rows, n_input_cols, n_channels, activation,
    padding_top, padding_left, padding_bottom, padding_right
  )
{
}
65
/** Construct an fp16 depthwise convolution with an explicit output size.
 *
 * Overload that additionally receives the output dimensions; all arguments
 * are forwarded unchanged to the corresponding Base constructor.
 *
 * @param n_batches      Number of batches in the input tensor.
 * @param n_input_rows   Number of rows in the input tensor.
 * @param n_input_cols   Number of columns in the input tensor.
 * @param n_channels     Number of channels (depthwise: one filter per channel).
 * @param n_output_rows  Number of rows in the output tensor.
 * @param n_output_cols  Number of columns in the output tensor.
 * @param activation     Activation function to fuse into the convolution.
 * @param padding_top    Rows of implicit zero padding above the input.
 * @param padding_left   Columns of implicit zero padding left of the input.
 * @param padding_bottom Rows of implicit zero padding below the input.
 * @param padding_right  Columns of implicit zero padding right of the input.
 */
template <
  unsigned int OutputTileRows, unsigned int OutputTileCols,
  unsigned int KernelRows, unsigned int KernelCols,
  unsigned int StrideRows, unsigned int StrideCols
>
DepthwiseConvolution<
  OutputTileRows, OutputTileCols,
  KernelRows, KernelCols, StrideRows, StrideCols,
  float16_t, float16_t, float16_t
>::DepthwiseConvolution(
  int n_batches, int n_input_rows, int n_input_cols, int n_channels,
  int n_output_rows, int n_output_cols,
  ActivationFunction activation,
  unsigned int padding_top,
  unsigned int padding_left,
  unsigned int padding_bottom,
  unsigned int padding_right
) : Base(
    n_batches, n_input_rows, n_input_cols, n_channels,
    n_output_rows, n_output_cols, activation,
    padding_top, padding_left, padding_bottom, padding_right
  )
{
}
90
/** Compute one output tile of the fp16 depthwise convolution
 *  (strided-pointer variant).
 *
 * Input and output elements are addressed from a single base pointer plus
 * row/column strides (strides are in elements, not bytes).  Channels are
 * processed 8 at a time with NEON fp16 vectors; a scalar loop handles the
 * remaining (n_channels % 8) channels.
 *
 * Parameter blob layout (established by the load sequence below): for each
 * group of 8 channels, 8 biases followed by KernelRows*KernelCols groups of
 * 8 weights; the scalar tail reads the same layout one element at a time.
 *
 * Note: the accumulation uses separate vmulq/vaddq (not a fused
 * multiply-add), so results match the scalar tail's rounding behaviour
 * per operation ordering.  Activation is a non-type template parameter, so
 * the activation branches are resolved at compile time.
 *
 * @param n_channels         Number of channels to process for this tile.
 * @param weights_biases_ptr Packed bias/weight blob (layout above).
 * @param input              Base pointer of the input tile (channel-minor:
 *                           consecutive elements are consecutive channels).
 * @param in_row_stride      Element stride between input rows.
 * @param in_col_stride      Element stride between input columns.
 * @param output             Base pointer of the output tile.
 * @param out_row_stride     Element stride between output rows.
 * @param out_col_stride     Element stride between output columns.
 */
template <
  unsigned int OutputTileRows, unsigned int OutputTileCols,
  unsigned int KernelRows, unsigned int KernelCols,
  unsigned int StrideRows, unsigned int StrideCols
>
template <ActivationFunction Activation>
void DepthwiseConvolution<
  OutputTileRows, OutputTileCols,
  KernelRows, KernelCols, StrideRows, StrideCols,
  float16_t, float16_t, float16_t
>::execute_tile(
  int n_channels,
  const void *weights_biases_ptr,
  const float16_t *input,
  const unsigned int in_row_stride,
  const unsigned int in_col_stride,
  float16_t *output,
  const unsigned int out_row_stride,
  const unsigned int out_col_stride
)
{
  // Instantiate pointers
  const float16_t* __restrict__ inptr_base = input;
  float16_t* __restrict__ outptr_base = output;
  const float16_t* __restrict__ params = static_cast<const float16_t*>(weights_biases_ptr);

  // Perform the depthwise convolution
  int channels_remaining = n_channels;
  // Vectorised main loop: 8 channels per iteration (one fp16x8 lane group).
  for (; channels_remaining >= 8; channels_remaining -= 8)
  {
    // Load input tile
    float16x8_t u[Base::inner_tile_rows][Base::inner_tile_cols];
    for (int i = 0; i < Base::inner_tile_rows; i++)
    {
      const float16_t* const inptr_row = inptr_base + i*in_row_stride;
      for (int j = 0; j < Base::inner_tile_cols; j++)
      {
        u[i][j] = vld1q_f16(inptr_row + j*in_col_stride);
      }
    }
    inptr_base += 8;  // advance to the next group of 8 channels

    // Load weights tile: 8 biases first, then 8 weights per kernel tap
    float16x8_t vbias = vld1q_f16(params);
    params += 8;

    float16x8_t w[KernelRows][KernelCols];
    for (unsigned int i = 0; i < KernelRows; i++)
    {
      for (unsigned int j = 0; j < KernelCols; j++)
      {
        w[i][j] = vld1q_f16(params);
        params += 8;
      }
    }

    // Perform the convolution
    float16x8_t v[OutputTileRows][OutputTileCols];
    for (unsigned int out_i = 0; out_i < OutputTileRows; out_i++)
    {
      for (unsigned int out_j = 0; out_j < OutputTileCols; out_j++)
      {
        // Seed the accumulator with the bias
        v[out_i][out_j] = vbias;

        // Base co-ordinate of this output element in the input tile
        const int base_i = out_i * StrideRows;
        const int base_j = out_j * StrideCols;

        // Fill the accumulator
        for (unsigned int in_i = 0; in_i < KernelRows; in_i++)
        {
          const unsigned int i = base_i + in_i;
          for (unsigned int in_j = 0; in_j < KernelCols; in_j++)
          {
            const unsigned int j = base_j + in_j;

            // v[out_i][out_j] += w[in_i][in_j] * u[i][j];
            v[out_i][out_j] = vaddq_f16(v[out_i][out_j], vmulq_f16(w[in_i][in_j], u[i][j]));
          }
        }

        // Apply the activation function (compile-time selected)
        if (Activation == ActivationFunction::ReLU ||
            Activation == ActivationFunction::ReLU6)
        {
          v[out_i][out_j] = vmaxq_f16(v[out_i][out_j], vdupq_n_f16(0.0f));
        }
        if (Activation == ActivationFunction::ReLU6)
        {
          v[out_i][out_j] = vminq_f16(v[out_i][out_j], vdupq_n_f16(6.0f));
        }
      }
    }

    // Store the output tile
    for (unsigned int i = 0; i < OutputTileRows; i++)
    {
      float16_t* const outptr_row = outptr_base + i*out_row_stride;
      for (unsigned int j = 0; j < OutputTileCols; j++)
      {
        vst1q_f16(outptr_row + j*out_col_stride, v[i][j]);
      }
    }
    outptr_base += 8;
  }
  // Scalar tail: one channel per iteration, same algorithm as above.
  for (; channels_remaining; channels_remaining--)
  {
    // Load input tile
    float16_t u[Base::inner_tile_rows][Base::inner_tile_cols];
    for (int i = 0; i < Base::inner_tile_rows; i++)
    {
      const float16_t* const inptr_row = inptr_base + i*in_row_stride;
      for (int j = 0; j < Base::inner_tile_cols; j++)
      {
        u[i][j] = *(inptr_row + j*in_col_stride);
      }
    }
    inptr_base++;

    // Load weights tile (bias, then one weight per kernel tap)
    float16_t bias = *(params++);
    float16_t w[KernelRows][KernelCols];
    for (unsigned int i = 0; i < KernelRows; i++)
    {
      for (unsigned int j = 0; j < KernelCols; j++)
      {
        w[i][j] = *(params++);
      }
    }

    // Perform the convolution
    float16_t v[OutputTileRows][OutputTileCols];
    for (unsigned int out_i = 0; out_i < OutputTileRows; out_i++)
    {
      for (unsigned int out_j = 0; out_j < OutputTileCols; out_j++)
      {
        // Clear the accumulator
        v[out_i][out_j] = bias;

        // Base co-ordinate
        const int base_i = out_i * StrideRows;
        const int base_j = out_j * StrideCols;

        // Fill the accumulator
        for (unsigned int in_i = 0; in_i < KernelRows; in_i++)
        {
          const unsigned int i = base_i + in_i;
          for (unsigned int in_j = 0; in_j < KernelCols; in_j++)
          {
            const int j = base_j + in_j;
            v[out_i][out_j] += w[in_i][in_j] * u[i][j];
          }
        }

        // Apply the activation function
        if (Activation == ActivationFunction::ReLU ||
            Activation == ActivationFunction::ReLU6)
        {
          v[out_i][out_j] = std::max<float16_t>(0.0f, v[out_i][out_j]);
        }
        if (Activation == ActivationFunction::ReLU6)
        {
          v[out_i][out_j] = std::min<float16_t>(6.0f, v[out_i][out_j]);
        }
      }
    }

    // Store the output tile
    for (unsigned int i = 0; i < OutputTileRows; i++)
    {
      float16_t* const outptr_row = outptr_base + i*out_row_stride;
      for (unsigned int j = 0; j < OutputTileCols; j++)
      {
        *(outptr_row + j*out_col_stride) = v[i][j];
      }
    }
    outptr_base++;
  }
}
Georgios Pinitas47d39dc2019-03-11 14:03:23 +0000270
/** Compute one output tile of the fp16 depthwise convolution
 *  (pointer-array variant).
 *
 * Unlike the strided overload, each input tile element and each output tile
 * element has its own base pointer (inptrs / outptrs); the running channel
 * offset @c n is added to every pointer.  This allows the caller to supply
 * non-uniformly-strided (e.g. padded) tiles — presumably the padding case;
 * verify against the caller.
 *
 * Channels are processed 8 at a time with NEON fp16 vectors, then a scalar
 * loop handles the remaining (n_channels % 8) channels.  Parameter blob
 * layout matches the strided variant: per 8-channel group, 8 biases then
 * KernelRows*KernelCols groups of 8 weights.
 *
 * @param n_channels         Number of channels to process for this tile.
 * @param weights_biases_ptr Packed bias/weight blob (layout above).
 * @param inptrs             Per-element input base pointers, indexed
 *                           [inner_tile_row][inner_tile_col].
 * @param outptrs            Per-element output base pointers, indexed
 *                           [output_tile_row][output_tile_col].
 */
template <
  unsigned int OutputTileRows, unsigned int OutputTileCols,
  unsigned int KernelRows, unsigned int KernelCols,
  unsigned int StrideRows, unsigned int StrideCols
>
template <ActivationFunction Activation>
void DepthwiseConvolution<
  OutputTileRows, OutputTileCols,
  KernelRows, KernelCols, StrideRows, StrideCols,
  float16_t, float16_t, float16_t
>::execute_tile(
  int n_channels,
  const void *weights_biases_ptr,
  const float16_t * inptrs[Base::inner_tile_rows][Base::inner_tile_cols],
  float16_t *outptrs[Base::output_tile_rows][Base::output_tile_cols]
)
{
  // Instantiate pointers
  const float16_t* __restrict__ params = static_cast<const float16_t*>(weights_biases_ptr);
  int n = 0;  // running channel offset applied to every in/out pointer

  // Perform the depthwise convolution
  int channels_remaining = n_channels;
  // Vectorised main loop: 8 channels per iteration.
  for (; channels_remaining >= 8; channels_remaining -= 8, n += 8)
  {
    // Load input tile
    float16x8_t u[Base::inner_tile_rows][Base::inner_tile_cols];
    for (int i = 0; i < Base::inner_tile_rows; i++)
    {
      for (int j = 0; j < Base::inner_tile_cols; j++)
      {
        u[i][j] = vld1q_f16(inptrs[i][j] + n);
      }
    }

    // Load weights tile: 8 biases first, then 8 weights per kernel tap
    float16x8_t vbias = vld1q_f16(params);
    params += 8;

    float16x8_t w[KernelRows][KernelCols];
    for (unsigned int i = 0; i < KernelRows; i++)
    {
      for (unsigned int j = 0; j < KernelCols; j++)
      {
        w[i][j] = vld1q_f16(params);
        params += 8;
      }
    }

    // Perform the convolution
    float16x8_t v[OutputTileRows][OutputTileCols];
    for (unsigned int out_i = 0; out_i < OutputTileRows; out_i++)
    {
      for (unsigned int out_j = 0; out_j < OutputTileCols; out_j++)
      {
        // Seed the accumulator with the bias
        v[out_i][out_j] = vbias;

        // Base co-ordinate of this output element in the input tile
        const int base_i = out_i * StrideRows;
        const int base_j = out_j * StrideCols;

        // Fill the accumulator
        for (unsigned int in_i = 0; in_i < KernelRows; in_i++)
        {
          const unsigned int i = base_i + in_i;
          for (unsigned int in_j = 0; in_j < KernelCols; in_j++)
          {
            const unsigned int j = base_j + in_j;

            // v[out_i][out_j] += w[in_i][in_j] * u[i][j];
            v[out_i][out_j] = vaddq_f16(v[out_i][out_j], vmulq_f16(w[in_i][in_j], u[i][j]));
          }
        }

        // Apply the activation function (compile-time selected)
        if (Activation == ActivationFunction::ReLU ||
            Activation == ActivationFunction::ReLU6)
        {
          v[out_i][out_j] = vmaxq_f16(v[out_i][out_j], vdupq_n_f16(0.0f));
        }
        if (Activation == ActivationFunction::ReLU6)
        {
          v[out_i][out_j] = vminq_f16(v[out_i][out_j], vdupq_n_f16(6.0f));
        }
      }
    }

    // Store the output tile
    for (unsigned int i = 0; i < OutputTileRows; i++)
    {
      for (unsigned int j = 0; j < OutputTileCols; j++)
      {
        vst1q_f16(outptrs[i][j] + n, v[i][j]);
      }
    }
  }
  // Scalar tail: one channel per iteration, same algorithm as above.
  for (; channels_remaining; channels_remaining--, n++)
  {
    // Load input tile
    float16_t u[Base::inner_tile_rows][Base::inner_tile_cols];
    for (int i = 0; i < Base::inner_tile_rows; i++)
    {
      for (int j = 0; j < Base::inner_tile_cols; j++)
      {
        u[i][j] = *(inptrs[i][j] + n);
      }
    }

    // Load weights tile (bias, then one weight per kernel tap)
    float16_t bias = *(params++);
    float16_t w[KernelRows][KernelCols];
    for (unsigned int i = 0; i < KernelRows; i++)
    {
      for (unsigned int j = 0; j < KernelCols; j++)
      {
        w[i][j] = *(params++);
      }
    }

    // Perform the convolution
    float16_t v[OutputTileRows][OutputTileCols];
    for (unsigned int out_i = 0; out_i < OutputTileRows; out_i++)
    {
      for (unsigned int out_j = 0; out_j < OutputTileCols; out_j++)
      {
        // Clear the accumulator
        v[out_i][out_j] = bias;

        // Base co-ordinate
        const int base_i = out_i * StrideRows;
        const int base_j = out_j * StrideCols;

        // Fill the accumulator
        for (unsigned int in_i = 0; in_i < KernelRows; in_i++)
        {
          const unsigned int i = base_i + in_i;
          for (unsigned int in_j = 0; in_j < KernelCols; in_j++)
          {
            const int j = base_j + in_j;
            v[out_i][out_j] += w[in_i][in_j] * u[i][j];
          }
        }

        // Apply the activation function
        if (Activation == ActivationFunction::ReLU ||
            Activation == ActivationFunction::ReLU6)
        {
          v[out_i][out_j] = std::max<float16_t>(0.0f, v[out_i][out_j]);
        }
        if (Activation == ActivationFunction::ReLU6)
        {
          v[out_i][out_j] = std::min<float16_t>(6.0f, v[out_i][out_j]);
        }
      }
    }

    // Store the output tile
    for (unsigned int i = 0; i < OutputTileRows; i++)
    {
      for (unsigned int j = 0; j < OutputTileCols; j++)
      {
        *(outptrs[i][j] + n) = v[i][j];
      }
    }
  }
}
437
Georgios Pinitas20c246a2018-09-12 16:45:53 +0100438} // namespace depthwise
439#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC