blob: ed4cfb86b93237b5951ea9cee03715fe1e2d8aaf [file] [log] [blame]
Georgios Pinitas20c246a2018-09-12 16:45:53 +01001/*
2 * Copyright (c) 2018 ARM Limited.
3 *
4 * SPDX-License-Identifier: MIT
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to
8 * deal in the Software without restriction, including without limitation the
9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10 * sell copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in all
14 * copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24
25/*
26 * !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
27 *
28 * NOTE: Header to be included by implementation files only.
29 *
30 * !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
31 */
32#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
33#include "arm_compute/core/NEON/kernels/convolution/common/arm.hpp"
34#include "arm_compute/core/NEON/kernels/convolution/depthwise/impl_base.hpp"
35
36#pragma once
37
38namespace depthwise
39{
40// Partial specialisation for FP16 to FP16
41template <int OutputTileRows, int OutputTileCols,
42 int KernelRows, int KernelCols,
43 int StrideRows, int StrideCols>
44struct DepthwiseConvolutionImpl<OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols, float16_t, float16_t>
45{
46 typedef DepthwiseConvolution<
47 OutputTileRows, OutputTileCols,
48 KernelRows, KernelCols,
49 StrideRows, StrideCols,
50 float16_t, float16_t
51 > DWC;
52
53 template <
54 bool Specialize=false, // Specialize (or not) the method
55 int InPadTop=0, // If specialized, top padding
56 int InPadLeft=0, // If specialized, left padding
57 int InPadBottom=0, // If specialized, bottom padding
58 int InPadRight=0, // If specialized, right padding
59 int OutPadBottom=0, // If specialized, bottom output padding
60 int OutPadRight=0 // If specialized, bottom right padding
61 >
62 static void process_tile(
63 const int n_channels,
64 const float16_t* const weights,
65 const int weight_row_stride,
66 const int weight_col_stride,
67 const float16_t* const inptr,
68 const int in_row_stride,
69 const int in_col_stride,
70 float16_t* const outptr,
71 const int out_row_stride,
72 const int out_col_stride,
73 const int in_pad_top=0,
74 const int in_pad_left=0,
75 const int in_pad_bottom=0,
76 const int in_pad_right=0,
77 const int out_pad_bottom=0,
78 const int out_pad_right=0
79 );
80};
81
82
83template <int OTR, int OTC, int KR, int KC, int SR, int SC>
84template <
85 bool Specialize,
86 int InPadTop, int InPadLeft, int InPadBottom, int InPadRight,
87 int OutPadBottom, int OutPadRight
88>
89void DepthwiseConvolutionImpl<OTR, OTC, KR, KC, SR, SC, float16_t, float16_t>::process_tile(
90 const int n_channels,
91 const float16_t *__restrict__ const weights,
92 const int weight_row_stride,
93 const int weight_col_stride,
94 const float16_t *__restrict__ const inptr,
95 const int in_row_stride,
96 const int in_col_stride,
97 float16_t *__restrict__ const outptr,
98 const int out_row_stride,
99 const int out_col_stride,
100 const int _in_pad_top,
101 const int _in_pad_left,
102 const int _in_pad_bottom,
103 const int _in_pad_right,
104 const int _out_pad_bottom,
105 const int _out_pad_right
106)
107{
108 constexpr auto inner_tile_rows = DWC::inner_tile_rows;
109 constexpr auto inner_tile_cols = DWC::inner_tile_cols;
110 constexpr auto kernel_rows = DWC::kernel_rows;
111 constexpr auto kernel_cols = DWC::kernel_cols;
112 constexpr auto output_tile_rows = DWC::output_tile_rows;
113 constexpr auto output_tile_cols = DWC::output_tile_cols;
114 constexpr auto stride_rows = DWC::stride_rows;
115 constexpr auto stride_cols = DWC::stride_cols;
116
117 // Extract parameters
118 const int in_pad_top = Specialize ? InPadTop : _in_pad_top;
119 const int in_pad_left = Specialize ? InPadLeft : _in_pad_left;
120 const int in_pad_bottom = Specialize ? InPadBottom : _in_pad_bottom;
121 const int in_pad_right = Specialize ? InPadRight : _in_pad_right;
122 const int out_pad_bottom = Specialize ? OutPadBottom : _out_pad_bottom;
123 const int out_pad_right = Specialize ? OutPadRight : _out_pad_right;
124
125 // Compute valid ranges of the tile
126 const int in_cells_i = inner_tile_rows - in_pad_bottom;
127 const int in_cells_j = inner_tile_cols - in_pad_right;
128 const int out_cells_i = output_tile_rows - out_pad_bottom;
129 const int out_cells_j = output_tile_cols - out_pad_right;
130
131 // Instantiate pointers
132 const float16_t* __restrict__ inptr_base = inptr;
133 const float16_t* __restrict__ wptr_base = weights;
134 float16_t* __restrict__ outptr_base = outptr;
135
136 // Perform the depthwise convolution
137 int channels_remaining = n_channels;
138#ifdef __aarch64__
139 for (; channels_remaining >= 8; channels_remaining -= 8)
140 {
141 // Load input tile
142 float16x8_t u[inner_tile_rows][inner_tile_cols];
143 for (int i = 0; i < inner_tile_rows; i++)
144 {
145 const float16_t* const inptr_row = inptr_base + (i - in_pad_top)*in_row_stride;
146 for (int j = 0; j < inner_tile_cols; j++)
147 {
148 if (i < in_pad_top || in_cells_i <= i ||
149 j < in_pad_left || in_cells_j <= j)
150 {
151 u[i][j] = vdupq_n_f16(0.0f);
152 }
153 else
154 {
155 u[i][j] = vld1q_f16(inptr_row + (j - in_pad_left)*in_col_stride);
156 }
157 }
158 }
159 inptr_base += 8;
160
161 // Load weights tile
162 float16x8_t w[kernel_rows][kernel_cols];
163 for (int i = 0; i < kernel_rows; i++)
164 {
165 const float16_t* const wptr_row = wptr_base + i*weight_row_stride;
166 for (int j = 0; j < kernel_cols; j++)
167 {
168 w[i][j] = vld1q_f16(wptr_row + j*weight_col_stride);
169 }
170 }
171 wptr_base += 8;
172
173 // Perform the convolution
174 float16x8_t v[output_tile_rows][output_tile_cols];
175 for (int out_i = 0; out_i < out_cells_i; out_i++)
176 {
177 for (int out_j = 0; out_j < out_cells_j; out_j++)
178 {
179 // Base co-ordinate
180 const int base_i = out_i * stride_rows;
181 const int base_j = out_j * stride_cols;
182
183 // Fill the accumulator
184 for (int in_i = 0; in_i < kernel_rows; in_i++)
185 {
186 const int i = base_i + in_i;
187 for (int in_j = 0; in_j < kernel_cols; in_j++)
188 {
189 const int j = base_j + in_j;
190 if (in_i == 0 && in_j == 0)
191 {
192 // v[out_i][out_j] = w[in_i][in_j] * u[i][j];
193 v[out_i][out_j] = vmulq_f16(w[in_i][in_j], u[i][j]);
194 }
195 else
196 {
197 // v[out_i][out_j] += w[in_i][in_j] * u[i][j];
198 v[out_i][out_j] = vaddq_f16(v[out_i][out_j], vmulq_f16(w[in_i][in_j], u[i][j]));
199 }
200 }
201 }
202 }
203 }
204
205 // Store the output tile
206 for (int i = 0; i < out_cells_i; i++)
207 {
208 float16_t* const outptr_row = outptr_base + i*out_row_stride;
209 for (int j = 0; j < out_cells_j; j++)
210 {
211 vst1q_f16(outptr_row + j*out_col_stride, v[i][j]);
212 }
213 }
214 outptr_base += 8;
215 }
216#endif // __aarch64__
217 for (; channels_remaining; channels_remaining--)
218 {
219 // Load input tile
220 float16_t u[inner_tile_rows][inner_tile_cols];
221 for (int i = 0; i < inner_tile_rows; i++)
222 {
223 const float16_t* const inptr_row = inptr_base + (i - in_pad_top)*in_row_stride;
224 for (int j = 0; j < inner_tile_cols; j++)
225 {
226 if (i < in_pad_top || in_cells_i <= i ||
227 j < in_pad_left || in_cells_j <= j)
228 {
229 u[i][j] = static_cast<float16_t>(0);
230 }
231 else
232 {
233 u[i][j] = *(inptr_row + (j - in_pad_left)*in_col_stride);
234 }
235 }
236 }
237 inptr_base++;
238
239 // Load weights tile
240 float16_t w[kernel_rows][kernel_cols];
241 for (int i = 0; i < kernel_rows; i++)
242 {
243 const float16_t* const wptr_row = wptr_base + i*weight_row_stride;
244 for (int j = 0; j < kernel_cols; j++)
245 {
246 w[i][j] = *(wptr_row + j*weight_col_stride);
247 }
248 }
249 wptr_base++;
250
251 // Perform the convolution
252 float16_t v[output_tile_rows][output_tile_cols];
253 for (int out_i = 0; out_i < out_cells_i; out_i++)
254 {
255 for (int out_j = 0; out_j < out_cells_j; out_j++)
256 {
257 // Clear the accumulator
258 v[out_i][out_j] = static_cast<float16_t>(0);
259
260 // Base co-ordinate
261 const int base_i = out_i * stride_rows;
262 const int base_j = out_j * stride_cols;
263
264 // Fill the accumulator
265 for (int in_i = 0; in_i < kernel_rows; in_i++)
266 {
267 const int i = base_i + in_i;
268 for (int in_j = 0; in_j < kernel_cols; in_j++)
269 {
270 const int j = base_j + in_j;
271 v[out_i][out_j] += w[in_i][in_j] * u[i][j];
272 }
273 }
274 }
275 }
276
277 // Store the output tile
278 for (int i = 0; i < out_cells_i; i++)
279 {
280 float16_t* const outptr_row = outptr_base + i*out_row_stride;
281 for (int j = 0; j < out_cells_j; j++)
282 {
283 *(outptr_row + j*out_col_stride) = v[i][j];
284 }
285 }
286 outptr_base++;
287 }
288}
289} // namespace depthwise
290#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC