blob: 840086f917c11b618a204b3f4eb4ecef2d4aecfd [file] [log] [blame]
Georgios Pinitas4074c992018-01-30 18:13:46 +00001/*
2 * Copyright (c) 2018 ARM Limited.
3 *
4 * SPDX-License-Identifier: MIT
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to
8 * deal in the Software without restriction, including without limitation the
9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10 * sell copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in all
14 * copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24
25/*
26 * !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
27 *
28 * NOTE: Header to be included by implementation files only.
29 *
30 * !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
31 */
32
33#include "arm_compute/core/NEON/kernels/convolution/common/arm.hpp"
34#include "arm_compute/core/NEON/kernels/convolution/depthwise/impl_base.hpp"
35
36#pragma once
37
38namespace depthwise
39{
40// Partial specialisation for FP32 to FP32
41template <int OutputTileRows, int OutputTileCols,
42 int KernelRows, int KernelCols,
43 int StrideRows, int StrideCols>
44struct DepthwiseConvolutionImpl<OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols, float, float>
45{
46 typedef DepthwiseConvolution<
47 OutputTileRows, OutputTileCols,
48 KernelRows, KernelCols,
49 StrideRows, StrideCols,
50 float, float
51 > DWC;
52
53 template <
Georgios Pinitasbe0ae932018-03-13 13:08:12 +000054 bool Specialize=false, // Specialize (or not) the method
55 int InPadTop=0, // If specialized, top padding
56 int InPadLeft=0, // If specialized, left padding
57 int InPadBottom=0, // If specialized, bottom padding
58 int InPadRight=0, // If specialized, right padding
59 int OutPadBottom=0, // If specialized, bottom output padding
60 int OutPadRight=0 // If specialized, bottom right padding
Georgios Pinitas4074c992018-01-30 18:13:46 +000061 >
62 static void process_tile(
63 const int n_channels,
64 const float* const weights,
Georgios Pinitasbe0ae932018-03-13 13:08:12 +000065 const int weight_row_stride,
66 const int weight_col_stride,
Georgios Pinitas4074c992018-01-30 18:13:46 +000067 const float* const inptr,
68 const int in_row_stride,
69 const int in_col_stride,
70 float* const outptr,
71 const int out_row_stride,
Georgios Pinitasbe0ae932018-03-13 13:08:12 +000072 const int out_col_stride,
73 const int in_pad_top=0,
74 const int in_pad_left=0,
75 const int in_pad_bottom=0,
76 const int in_pad_right=0,
77 const int out_pad_bottom=0,
Georgios Pinitasa799ce02018-09-12 20:11:34 +010078 const int out_pad_right=0,
79 const int input_offset=0,
80 const int weights_offset=0
Georgios Pinitas4074c992018-01-30 18:13:46 +000081 );
82};
83
84
85template <int OTR, int OTC, int KR, int KC, int SR, int SC>
86template <
Georgios Pinitasbe0ae932018-03-13 13:08:12 +000087 bool Specialize,
88 int InPadTop, int InPadLeft, int InPadBottom, int InPadRight,
89 int OutPadBottom, int OutPadRight
Georgios Pinitas4074c992018-01-30 18:13:46 +000090>
91void DepthwiseConvolutionImpl<OTR, OTC, KR, KC, SR, SC, float, float>::process_tile(
92 const int n_channels,
Georgios Pinitasbe0ae932018-03-13 13:08:12 +000093 const float *__restrict__ const weights,
94 const int weight_row_stride,
95 const int weight_col_stride,
96 const float *__restrict__ const inptr,
Georgios Pinitas4074c992018-01-30 18:13:46 +000097 const int in_row_stride,
98 const int in_col_stride,
Georgios Pinitasbe0ae932018-03-13 13:08:12 +000099 float *__restrict__ const outptr,
Georgios Pinitas4074c992018-01-30 18:13:46 +0000100 const int out_row_stride,
Georgios Pinitasbe0ae932018-03-13 13:08:12 +0000101 const int out_col_stride,
102 const int _in_pad_top,
103 const int _in_pad_left,
104 const int _in_pad_bottom,
105 const int _in_pad_right,
106 const int _out_pad_bottom,
Georgios Pinitasa799ce02018-09-12 20:11:34 +0100107 const int _out_pad_right,
108 const int _input_offset,
109 const int _weights_offset
Georgios Pinitas4074c992018-01-30 18:13:46 +0000110)
111{
112 constexpr auto inner_tile_rows = DWC::inner_tile_rows;
113 constexpr auto inner_tile_cols = DWC::inner_tile_cols;
114 constexpr auto kernel_rows = DWC::kernel_rows;
115 constexpr auto kernel_cols = DWC::kernel_cols;
116 constexpr auto output_tile_rows = DWC::output_tile_rows;
117 constexpr auto output_tile_cols = DWC::output_tile_cols;
118 constexpr auto stride_rows = DWC::stride_rows;
119 constexpr auto stride_cols = DWC::stride_cols;
120
Georgios Pinitasbe0ae932018-03-13 13:08:12 +0000121 // Extract parameters
122 const int in_pad_top = Specialize ? InPadTop : _in_pad_top;
123 const int in_pad_left = Specialize ? InPadLeft : _in_pad_left;
124 const int in_pad_bottom = Specialize ? InPadBottom : _in_pad_bottom;
125 const int in_pad_right = Specialize ? InPadRight : _in_pad_right;
126 const int out_pad_bottom = Specialize ? OutPadBottom : _out_pad_bottom;
127 const int out_pad_right = Specialize ? OutPadRight : _out_pad_right;
128
Georgios Pinitas4074c992018-01-30 18:13:46 +0000129 // Compute valid ranges of the tile
Georgios Pinitasbe0ae932018-03-13 13:08:12 +0000130 const int in_cells_i = inner_tile_rows - in_pad_bottom;
131 const int in_cells_j = inner_tile_cols - in_pad_right;
132 const int out_cells_i = output_tile_rows - out_pad_bottom;
133 const int out_cells_j = output_tile_cols - out_pad_right;
Georgios Pinitas4074c992018-01-30 18:13:46 +0000134
135 // Instantiate pointers
Georgios Pinitasbe0ae932018-03-13 13:08:12 +0000136 const float* __restrict__ inptr_base = inptr;
137 const float* __restrict__ wptr_base = weights;
138 float* __restrict__ outptr_base = outptr;
Georgios Pinitas4074c992018-01-30 18:13:46 +0000139
140 // Perform the depthwise convolution
141 int channels_remaining = n_channels;
142#ifdef __aarch64__
143 for (; channels_remaining >= 4; channels_remaining -= 4)
144 {
145 // Load input tile
146 float32x4_t u[inner_tile_rows][inner_tile_cols];
147 for (int i = 0; i < inner_tile_rows; i++)
148 {
149 const float* const inptr_row = inptr_base + (i - in_pad_top)*in_row_stride;
150 for (int j = 0; j < inner_tile_cols; j++)
151 {
152 if (i < in_pad_top || in_cells_i <= i ||
153 j < in_pad_left || in_cells_j <= j)
154 {
155 u[i][j] = vdupq_n_f32(0.0f);
156 }
157 else
158 {
159 u[i][j] = vld1q_f32(inptr_row + (j - in_pad_left)*in_col_stride);
160 }
161 }
162 }
163 inptr_base += 4;
164
165 // Load weights tile
166 float32x4_t w[kernel_rows][kernel_cols];
167 for (int i = 0; i < kernel_rows; i++)
168 {
169 const float* const wptr_row = wptr_base + i*weight_row_stride;
170 for (int j = 0; j < kernel_cols; j++)
171 {
172 w[i][j] = vld1q_f32(wptr_row + j*weight_col_stride);
173 }
174 }
175 wptr_base += 4;
176
177 // Perform the convolution
Georgios Pinitasbe0ae932018-03-13 13:08:12 +0000178 float32x4_t v[output_tile_rows][output_tile_cols];
Georgios Pinitas4074c992018-01-30 18:13:46 +0000179 for (int out_i = 0; out_i < out_cells_i; out_i++)
180 {
181 for (int out_j = 0; out_j < out_cells_j; out_j++)
182 {
183 // Base co-ordinate
184 const int base_i = out_i * stride_rows;
185 const int base_j = out_j * stride_cols;
186
187 // Fill the accumulator
188 for (int in_i = 0; in_i < kernel_rows; in_i++)
189 {
190 const int i = base_i + in_i;
191 for (int in_j = 0; in_j < kernel_cols; in_j++)
192 {
193 const int j = base_j + in_j;
194 if (in_i == 0 && in_j == 0)
195 {
196 // v[out_i][out_j] = w[in_i][in_j] * u[i][j];
197 v[out_i][out_j] = vmulq_f32(w[in_i][in_j], u[i][j]);
198 }
199 else
200 {
201 // v[out_i][out_j] += w[in_i][in_j] * u[i][j];
202 v[out_i][out_j] = vmlaq_f32(v[out_i][out_j], w[in_i][in_j], u[i][j]);
203 }
204 }
205 }
206 }
207 }
208
209 // Store the output tile
210 for (int i = 0; i < out_cells_i; i++)
211 {
212 float* const outptr_row = outptr_base + i*out_row_stride;
213 for (int j = 0; j < out_cells_j; j++)
214 {
215 vst1q_f32(outptr_row + j*out_col_stride, v[i][j]);
216 }
217 }
218 outptr_base += 4;
219 }
220#endif // __aarch64__
221 for (; channels_remaining; channels_remaining--)
222 {
223 // Load input tile
224 float u[inner_tile_rows][inner_tile_cols];
225 for (int i = 0; i < inner_tile_rows; i++)
226 {
227 const float* const inptr_row = inptr_base + (i - in_pad_top)*in_row_stride;
228 for (int j = 0; j < inner_tile_cols; j++)
229 {
230 if (i < in_pad_top || in_cells_i <= i ||
231 j < in_pad_left || in_cells_j <= j)
232 {
233 u[i][j] = static_cast<float>(0);
234 }
235 else
236 {
237 u[i][j] = *(inptr_row + (j - in_pad_left)*in_col_stride);
238 }
239 }
240 }
241 inptr_base++;
242
243 // Load weights tile
244 float w[kernel_rows][kernel_cols];
245 for (int i = 0; i < kernel_rows; i++)
246 {
247 const float* const wptr_row = wptr_base + i*weight_row_stride;
248 for (int j = 0; j < kernel_cols; j++)
249 {
250 w[i][j] = *(wptr_row + j*weight_col_stride);
251 }
252 }
253 wptr_base++;
254
255 // Perform the convolution
Georgios Pinitasbe0ae932018-03-13 13:08:12 +0000256 float v[output_tile_rows][output_tile_cols];
Georgios Pinitas4074c992018-01-30 18:13:46 +0000257 for (int out_i = 0; out_i < out_cells_i; out_i++)
258 {
259 for (int out_j = 0; out_j < out_cells_j; out_j++)
260 {
261 // Clear the accumulator
262 v[out_i][out_j] = static_cast<float>(0);
263
264 // Base co-ordinate
265 const int base_i = out_i * stride_rows;
266 const int base_j = out_j * stride_cols;
267
268 // Fill the accumulator
269 for (int in_i = 0; in_i < kernel_rows; in_i++)
270 {
271 const int i = base_i + in_i;
272 for (int in_j = 0; in_j < kernel_cols; in_j++)
273 {
274 const int j = base_j + in_j;
275 v[out_i][out_j] += w[in_i][in_j] * u[i][j];
276 }
277 }
278 }
279 }
280
281 // Store the output tile
282 for (int i = 0; i < out_cells_i; i++)
283 {
284 float* const outptr_row = outptr_base + i*out_row_stride;
285 for (int j = 0; j < out_cells_j; j++)
286 {
287 *(outptr_row + j*out_col_stride) = v[i][j];
288 }
289 }
290 outptr_base++;
291 }
292}
293
294} // namespace depthwise