blob: 6be50fdb0d6c1fe67da23696eccca4b5d027d465 [file] [log] [blame]
/*
 * Copyright (c) 2017-2018 ARM Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
24#include "arm_compute/core/NEON/kernels/NEBatchNormalizationLayerKernel.h"
25
26#include "arm_compute/core/Helpers.h"
27#include "arm_compute/core/NEON/NEFixedPoint.h"
28#include "arm_compute/core/NEON/NEMath.h"
Georgios Pinitas57c033b2018-02-15 12:29:44 +000029#include "arm_compute/core/NEON/kernels/detail/NEActivationFunctionDetail.h"
Anthony Barbier6ff3b192017-09-04 18:44:23 +010030#include "arm_compute/core/TensorInfo.h"
31#include "arm_compute/core/Utils.h"
32#include "arm_compute/core/Validate.h"
33#include "arm_compute/core/Window.h"
34
Georgios Pinitas57c033b2018-02-15 12:29:44 +000035#include <map>
36
Anthony Barbier6ff3b192017-09-04 18:44:23 +010037using namespace arm_compute;
38
Ioan-Cristian Szabo303be902017-11-27 16:31:10 +000039namespace
Anthony Barbier6ff3b192017-09-04 18:44:23 +010040{
Georgios Pinitas57c033b2018-02-15 12:29:44 +000041Status
42validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *mean, const ITensorInfo *var,
43 const ITensorInfo *beta, const ITensorInfo *gamma, float epsilon, ActivationLayerInfo act_info)
Ioan-Cristian Szabo303be902017-11-27 16:31:10 +000044{
45 ARM_COMPUTE_UNUSED(epsilon);
Georgios Pinitas57c033b2018-02-15 12:29:44 +000046 ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16,
47 DataType::F32);
48
49 if(act_info.enabled())
50 {
51 ActivationLayerInfo::ActivationFunction act = act_info.activation();
52 ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() != DataType::F32);
53 ARM_COMPUTE_RETURN_ERROR_ON(act != ActivationLayerInfo::ActivationLayerInfo::ActivationFunction::RELU && act != ActivationLayerInfo::ActivationLayerInfo::ActivationFunction::BOUNDED_RELU
54 && act != ActivationLayerInfo::ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU);
55 ARM_COMPUTE_RETURN_ERROR_ON(act_info.b() > act_info.a());
56 }
Ioan-Cristian Szabo303be902017-11-27 16:31:10 +000057
58 if(nullptr != output)
59 {
60 ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
Michele Di Giorgio0cbb9272018-03-01 16:56:48 +000061 ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output);
Ioan-Cristian Szabo303be902017-11-27 16:31:10 +000062 ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
63 ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
64 }
65
Michele Di Giorgio4d336302018-03-02 09:43:54 +000066 ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, mean, var);
67 ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, mean, var);
68 ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(mean, var);
69 if(beta != nullptr)
70 {
71 ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, beta);
72 ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, beta);
73 ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(mean, beta);
74 }
75 if(gamma != nullptr)
76 {
77 ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, gamma);
78 ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, gamma);
79 ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(mean, gamma);
80 }
Michele Di Giorgio0cbb9272018-03-01 16:56:48 +000081 ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL)) != mean->dimension(0));
Ioan-Cristian Szabo303be902017-11-27 16:31:10 +000082
Georgios Pinitas631c41a2017-12-06 11:53:03 +000083 return Status{};
Ioan-Cristian Szabo303be902017-11-27 16:31:10 +000084}
85
Georgios Pinitas631c41a2017-12-06 11:53:03 +000086std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output)
Ioan-Cristian Szabo303be902017-11-27 16:31:10 +000087{
Michele Di Giorgio4d336302018-03-02 09:43:54 +000088 if(output != nullptr)
89 {
90 // Output tensor auto initialization if not yet initialized
91 auto_init_if_empty(*output, *input->clone());
92 }
93
Ioan-Cristian Szabo303be902017-11-27 16:31:10 +000094 unsigned int num_elems_processed_per_iteration = 16 / input->element_size();
95
96 Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
97 AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration);
98 AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
99 bool window_changed = update_window_and_padding(win, input_access, output_access);
100 output_access.set_valid_region(win, input->valid_region());
Georgios Pinitas631c41a2017-12-06 11:53:03 +0000101 Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
Ioan-Cristian Szabo303be902017-11-27 16:31:10 +0000102 return std::make_pair(err, win);
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100103}
Georgios Pinitas57c033b2018-02-15 12:29:44 +0000104} //namespace
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100105
Georgios Pinitas57c033b2018-02-15 12:29:44 +0000106template <bool fused_activation>
107void NEBatchNormalizationLayerKernel::batch_normalization_qs8(const Window &window)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100108{
Georgios Pinitas57c033b2018-02-15 12:29:44 +0000109 static_assert(!fused_activation, "Activation is not supported for QS8");
110
111 Iterator input(_input, window);
112 Iterator output(_output, window);
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100113
114 // Hold information about the current feature map we are iterating.
115 // Only compute denominator and NEON vectors once per feature map.
116 int slice = -1;
117
Georgios Pinitas57c033b2018-02-15 12:29:44 +0000118 const int fixed_point_position = _input->info()->fixed_point_position();
119 const auto input_mean = reinterpret_cast<const qint8_t *>(_mean->ptr_to_element(Coordinates(0, 0)));
120 const auto input_var = reinterpret_cast<const qint8_t *>(_var->ptr_to_element(Coordinates(0, 0)));
Michele Di Giorgio4d336302018-03-02 09:43:54 +0000121 const auto input_gamma = (_gamma != nullptr) ? reinterpret_cast<const qint8_t *>(_gamma->ptr_to_element(Coordinates(0, 0))) : nullptr;
122 const auto input_beta = (_beta != nullptr) ? reinterpret_cast<const qint8_t *>(_beta->ptr_to_element(Coordinates(0, 0))) : nullptr;
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100123
124 qint8x16_t mean_vec = vdupq_n_qs8(0);
125 qint8x16_t var_vec = vdupq_n_qs8(0);
Michele Di Giorgio4d336302018-03-02 09:43:54 +0000126 qint8x16_t gamma_vec = vdupq_n_qs8(sqcvt_qs8_f32(1, fixed_point_position));
127 qint8x16_t beta_vec = vdupq_n_qs8(sqcvt_qs8_f32(0, fixed_point_position));
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100128 qint8x16_t denominator = vdupq_n_qs8(0);
Georgios Pinitas57c033b2018-02-15 12:29:44 +0000129 const qint8x16_t epsilon_vec = vdupq_n_qs8(sqcvt_qs8_f32(_epsilon, fixed_point_position));
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100130 execute_window_loop(window, [&](const Coordinates & id)
131 {
132 if(slice != id.z())
133 {
134 // Conctruct vectors
Michele Di Giorgio4d336302018-03-02 09:43:54 +0000135 mean_vec = vdupq_n_qs8(*(input_mean + id.z()));
136 var_vec = vdupq_n_qs8(*(input_var + id.z()));
137 if(input_gamma != nullptr)
138 {
139 gamma_vec = vdupq_n_qs8(*(input_gamma + id.z()));
140 }
141 if(input_beta != nullptr)
142 {
143 beta_vec = vdupq_n_qs8(*(input_beta + id.z()));
144 }
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100145
146 // Calculate denominator
147 denominator = vqinvsqrtq_qs8(vqaddq_qs8(var_vec, epsilon_vec), fixed_point_position);
148 slice = id.z();
149 }
150
151 // Calculate x bar and store results
152 const qint8x16_t numerator = vqsubq_qs8(vld1q_qs8(reinterpret_cast<const qint8_t *>(input.ptr())), mean_vec);
153 const qint8x16_t x_bar = vqmulq_qs8(numerator, denominator, fixed_point_position);
154 vst1q_qs8(reinterpret_cast<qint8_t *>(output.ptr()), vqmlaq_qs8(beta_vec, x_bar, gamma_vec, fixed_point_position));
155 },
156 input, output);
157}
158
Georgios Pinitas57c033b2018-02-15 12:29:44 +0000159template <bool fused_activation>
160void NEBatchNormalizationLayerKernel::batch_normalization_qs16(const Window &window)
Michalis Spyroubbd3d602017-06-21 17:29:40 +0100161{
Georgios Pinitas57c033b2018-02-15 12:29:44 +0000162 static_assert(!fused_activation, "Activation is not supported for QS16");
163
164 Iterator input(_input, window);
165 Iterator output(_output, window);
Michalis Spyroubbd3d602017-06-21 17:29:40 +0100166
167 // Hold information about the current feature map we are iterating.
168 // Only compute denominator and NEON vectors once per feature map.
169 int slice = -1;
170
Georgios Pinitas57c033b2018-02-15 12:29:44 +0000171 const int fixed_point_position = _input->info()->fixed_point_position();
172 const auto input_mean = reinterpret_cast<const qint16_t *>(_mean->ptr_to_element(Coordinates(0, 0)));
173 const auto input_var = reinterpret_cast<const qint16_t *>(_var->ptr_to_element(Coordinates(0, 0)));
Michele Di Giorgio4d336302018-03-02 09:43:54 +0000174 const auto input_gamma = (_gamma != nullptr) ? reinterpret_cast<const qint16_t *>(_gamma->ptr_to_element(Coordinates(0, 0))) : nullptr;
175 const auto input_beta = (_beta != nullptr) ? reinterpret_cast<const qint16_t *>(_beta->ptr_to_element(Coordinates(0, 0))) : nullptr;
Michalis Spyroubbd3d602017-06-21 17:29:40 +0100176
177 qint16x8_t mean_vec = vdupq_n_qs16(0);
178 qint16x8_t var_vec = vdupq_n_qs16(0);
Michele Di Giorgio4d336302018-03-02 09:43:54 +0000179 qint16x8_t gamma_vec = vdupq_n_qs16(sqcvt_qs16_f32(1, fixed_point_position));
180 qint16x8_t beta_vec = vdupq_n_qs16(sqcvt_qs16_f32(0, fixed_point_position));
Michalis Spyroubbd3d602017-06-21 17:29:40 +0100181 qint16x8_t denominator = vdupq_n_qs16(0);
Georgios Pinitas57c033b2018-02-15 12:29:44 +0000182 const qint16x8_t epsilon_vec = vdupq_n_qs16(sqcvt_qs16_f32(_epsilon, fixed_point_position));
Michalis Spyroubbd3d602017-06-21 17:29:40 +0100183 execute_window_loop(window, [&](const Coordinates & id)
184 {
185 if(slice != id.z())
186 {
187 // Conctruct vectors
Michele Di Giorgio4d336302018-03-02 09:43:54 +0000188 mean_vec = vdupq_n_qs16(*(input_mean + id.z()));
189 var_vec = vdupq_n_qs16(*(input_var + id.z()));
190 if(input_gamma != nullptr)
191 {
192 gamma_vec = vdupq_n_qs16(*(input_gamma + id.z()));
193 }
194 if(input_beta != nullptr)
195 {
196 beta_vec = vdupq_n_qs16(*(input_beta + id.z()));
197 }
Michalis Spyroubbd3d602017-06-21 17:29:40 +0100198
199 // Calculate denominator
200 denominator = vqinvsqrtq_qs16(vqaddq_qs16(var_vec, epsilon_vec), fixed_point_position);
201 slice = id.z();
202 }
203
204 // Calculate x bar and store results
205 const qint16x8_t numerator = vqsubq_qs16(vld1q_qs16(reinterpret_cast<const qint16_t *>(input.ptr())), mean_vec);
206 const qint16x8_t x_bar = vqmulq_qs16(numerator, denominator, fixed_point_position);
207 vst1q_qs16(reinterpret_cast<qint16_t *>(output.ptr()), vqmlaq_qs16(beta_vec, x_bar, gamma_vec, fixed_point_position));
208 },
209 input, output);
210}
211
Georgios Pinitas57c033b2018-02-15 12:29:44 +0000212template <bool fused_activation>
Michele Di Giorgio0cbb9272018-03-01 16:56:48 +0000213void NEBatchNormalizationLayerKernel::batch_normalization_fp16_nchw(const Window &window)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100214{
Michele Di Giorgio0cbb9272018-03-01 16:56:48 +0000215 static_assert(!fused_activation, "Activation is not supported for FP16");
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100216
Georgios Pinitas57c033b2018-02-15 12:29:44 +0000217 ARM_COMPUTE_UNUSED(window);
Ioan-Cristian Szabo5edbd1c2017-11-13 13:34:08 +0000218#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
Georgios Pinitas57c033b2018-02-15 12:29:44 +0000219 Iterator input(_input, window);
220 Iterator output(_output, window);
Pablo Tello8fda1cb2017-07-05 15:20:38 +0100221
222 // Hold information about the current feature map we are iterating.
223 // Only compute denominator and NEON vectors once per feature map.
224 int slice = -1;
225
Georgios Pinitas57c033b2018-02-15 12:29:44 +0000226 const auto input_mean = reinterpret_cast<const float16_t *>(_mean->ptr_to_element(Coordinates(0, 0)));
227 const auto input_var = reinterpret_cast<const float16_t *>(_var->ptr_to_element(Coordinates(0, 0)));
Michele Di Giorgio4d336302018-03-02 09:43:54 +0000228 const auto input_gamma = (_gamma != nullptr) ? reinterpret_cast<const float16_t *>(_gamma->ptr_to_element(Coordinates(0, 0))) : nullptr;
229 const auto input_beta = (_beta != nullptr) ? reinterpret_cast<const float16_t *>(_beta->ptr_to_element(Coordinates(0, 0))) : nullptr;
Pablo Tello8fda1cb2017-07-05 15:20:38 +0100230
231 float16x8_t mean_vec = vdupq_n_f16(0.0);
232 float16x8_t var_vec = vdupq_n_f16(0.0);
Michele Di Giorgio4d336302018-03-02 09:43:54 +0000233 float16x8_t gamma_vec = vdupq_n_f16(1.0);
Pablo Tello8fda1cb2017-07-05 15:20:38 +0100234 float16x8_t beta_vec = vdupq_n_f16(0.0);
235 float16x8_t denominator = vdupq_n_f16(0.0);
Georgios Pinitas57c033b2018-02-15 12:29:44 +0000236 const float16x8_t epsilon_vec = vdupq_n_f16(_epsilon);
Pablo Tello8fda1cb2017-07-05 15:20:38 +0100237 execute_window_loop(window, [&](const Coordinates & id)
238 {
239 if(slice != id.z())
240 {
241 // Conctruct vectors
Michele Di Giorgio4d336302018-03-02 09:43:54 +0000242 mean_vec = vdupq_n_f16(*(input_mean + id.z()));
243 var_vec = vdupq_n_f16(*(input_var + id.z()));
244 if(input_gamma != nullptr)
245 {
246 gamma_vec = vdupq_n_f16(*(input_gamma + id.z()));
247 }
248 if(input_beta != nullptr)
249 {
250 beta_vec = vdupq_n_f16(*(input_beta + id.z()));
251 }
Pablo Tello8fda1cb2017-07-05 15:20:38 +0100252
253 // Calculate denominator
254 denominator = vinvsqrtq_f16(vaddq_f16(var_vec, epsilon_vec));
255 slice = id.z();
256 }
257
258 // Calculate x bar and store results
259 const float16x8_t numerator = vsubq_f16(vld1q_f16(reinterpret_cast<const float16_t *>(input.ptr())), mean_vec);
260 const float16x8_t x_bar = vmulq_f16(numerator, denominator);
261 vst1q_f16(reinterpret_cast<float16_t *>(output.ptr()), vaddq_f16(beta_vec, vmulq_f16(x_bar, gamma_vec)));
262 },
263 input, output);
Ioan-Cristian Szabo5edbd1c2017-11-13 13:34:08 +0000264#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
Georgios Pinitas57c033b2018-02-15 12:29:44 +0000265}
266
Michele Di Giorgio0cbb9272018-03-01 16:56:48 +0000267template <bool fused_activation>
268void NEBatchNormalizationLayerKernel::batch_normalization_fp16_nhwc(const Window &window)
269{
270 static_assert(!fused_activation, "Activation is not supported for FP16");
271
272 ARM_COMPUTE_UNUSED(window);
273#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
274 Iterator input(_input, window);
275 Iterator output(_output, window);
276
277 const auto input_mean = reinterpret_cast<const float16_t *>(_mean->ptr_to_element(Coordinates(0, 0)));
278 const auto input_var = reinterpret_cast<const float16_t *>(_var->ptr_to_element(Coordinates(0, 0)));
279 const auto input_gamma = (_gamma != nullptr) ? reinterpret_cast<const float16_t *>(_gamma->ptr_to_element(Coordinates(0, 0))) : nullptr;
280 const auto input_beta = (_beta != nullptr) ? reinterpret_cast<const float16_t *>(_beta->ptr_to_element(Coordinates(0, 0))) : nullptr;
281
282 const float16x8_t epsilon_vec = vdupq_n_f16(_epsilon);
283 execute_window_loop(window, [&](const Coordinates & id)
284 {
285 // Conctruct vectors
286 const float16x8_t mean_vec = vld1q_f16(input_mean + id.x());
287 const float16x8_t var_vec = vld1q_f16(input_var + id.x());
288 const float16x8_t gamma_vec = (input_gamma != nullptr) ? vld1q_f16(input_gamma + id.x()) : vdupq_n_f16(1.0);
289 const float16x8_t beta_vec = (input_beta != nullptr) ? vld1q_f16(input_beta + id.x()) : vdupq_n_f16(0.0);
290 // Calculate denominator
291 const float16x8_t denominator = vinvsqrtq_f16(vaddq_f16(var_vec, epsilon_vec));
292
293 // Calculate x bar and store results
294 const float16x8_t numerator = vsubq_f16(vld1q_f16(reinterpret_cast<const float16_t *>(input.ptr())), mean_vec);
295 const float16x8_t x_bar = vmulq_f16(numerator, denominator);
296 vst1q_f16(reinterpret_cast<float16_t *>(output.ptr()), vaddq_f16(beta_vec, vmulq_f16(x_bar, gamma_vec)));
297 },
298 input, output);
299#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
300}
301
Georgios Pinitas57c033b2018-02-15 12:29:44 +0000302template <bool fused_activation, typename F>
Michele Di Giorgio0cbb9272018-03-01 16:56:48 +0000303void NEBatchNormalizationLayerKernel::batch_normalization_fp32_nchw(const Window &window)
Georgios Pinitas57c033b2018-02-15 12:29:44 +0000304{
305 Iterator input(_input, window);
306 Iterator output(_output, window);
307
308 F activation_functor(_act_info);
309
310 // Hold information about the current feature map we are iterating.
311 // Only compute denominator and NEON vectors once per feature map.
312 int slice = -1;
313
314 const auto input_mean = reinterpret_cast<const float *>(_mean->ptr_to_element(Coordinates(0, 0)));
315 const auto input_var = reinterpret_cast<const float *>(_var->ptr_to_element(Coordinates(0, 0)));
Michele Di Giorgio4d336302018-03-02 09:43:54 +0000316 const auto input_gamma = (_gamma != nullptr) ? reinterpret_cast<const float *>(_gamma->ptr_to_element(Coordinates(0, 0))) : nullptr;
317 const auto input_beta = (_beta != nullptr) ? reinterpret_cast<const float *>(_beta->ptr_to_element(Coordinates(0, 0))) : nullptr;
Georgios Pinitas57c033b2018-02-15 12:29:44 +0000318
319 float32x4_t mean_vec = vdupq_n_f32(0.0);
320 float32x4_t var_vec = vdupq_n_f32(0.0);
Michele Di Giorgio4d336302018-03-02 09:43:54 +0000321 float32x4_t gamma_vec = vdupq_n_f32(1.0);
Georgios Pinitas57c033b2018-02-15 12:29:44 +0000322 float32x4_t beta_vec = vdupq_n_f32(0.0);
323 float32x4_t denominator = vdupq_n_f32(0.0);
324 const float32x4_t epsilon_vec = vdupq_n_f32(_epsilon);
325 execute_window_loop(window, [&](const Coordinates & id)
326 {
327 if(slice != id.z())
328 {
329 // Conctruct vectors
Michele Di Giorgio4d336302018-03-02 09:43:54 +0000330 mean_vec = vdupq_n_f32(*(input_mean + id.z()));
331 var_vec = vdupq_n_f32(*(input_var + id.z()));
332 if(input_gamma != nullptr)
333 {
334 gamma_vec = vdupq_n_f32(*(input_gamma + id.z()));
335 }
336 if(input_beta != nullptr)
337 {
338 beta_vec = vdupq_n_f32(*(input_beta + id.z()));
339 }
Georgios Pinitas57c033b2018-02-15 12:29:44 +0000340
341 // Calculate denominator
342 denominator = vinvsqrtq_f32(vaddq_f32(var_vec, epsilon_vec));
343 slice = id.z();
344 }
345
346 // Calculate x bar
347 const float32x4_t numerator = vsubq_f32(vld1q_f32(reinterpret_cast<const float *>(input.ptr())), mean_vec);
348 const float32x4_t x_bar = vmulq_f32(numerator, denominator);
349 float32x4_t res = vmlaq_f32(beta_vec, x_bar, gamma_vec);
350
351 // Perform fused activation
352 if(fused_activation)
353 {
354 activation_functor(res);
355 }
356
357 // Store results
358 vst1q_f32(reinterpret_cast<float *>(output.ptr()), res);
359 },
360 input, output);
361}
362
Michele Di Giorgio0cbb9272018-03-01 16:56:48 +0000363template <bool fused_activation, typename F>
364void NEBatchNormalizationLayerKernel::batch_normalization_fp32_nhwc(const Window &window)
365{
366 Iterator input(_input, window);
367 Iterator output(_output, window);
368
369 F activation_functor(_act_info);
370
371 const auto input_mean = reinterpret_cast<const float *>(_mean->ptr_to_element(Coordinates(0, 0)));
372 const auto input_var = reinterpret_cast<const float *>(_var->ptr_to_element(Coordinates(0, 0)));
373 const auto input_gamma = (_gamma != nullptr) ? reinterpret_cast<const float *>(_gamma->ptr_to_element(Coordinates(0, 0))) : nullptr;
374 const auto input_beta = (_beta != nullptr) ? reinterpret_cast<const float *>(_beta->ptr_to_element(Coordinates(0, 0))) : nullptr;
375
376 const float32x4_t epsilon_vec = vdupq_n_f32(_epsilon);
377 execute_window_loop(window, [&](const Coordinates & id)
378 {
379 // Conctruct vectors
380 const float32x4_t mean_vec = vld1q_f32(input_mean + id.x());
381 const float32x4_t var_vec = vld1q_f32(input_var + id.x());
382 const float32x4_t gamma_vec = (input_gamma != nullptr) ? vld1q_f32(input_gamma + id.x()) : vdupq_n_f32(1.0);
383 const float32x4_t beta_vec = (input_beta != nullptr) ? vld1q_f32(input_beta + id.x()) : vdupq_n_f32(0.0);
384 // Calculate denominator
385 const float32x4_t denominator = vinvsqrtq_f32(vaddq_f32(var_vec, epsilon_vec));
386
387 // Calculate x bar
388 const float32x4_t numerator = vsubq_f32(vld1q_f32(reinterpret_cast<const float *>(input.ptr())), mean_vec);
389 const float32x4_t x_bar = vmulq_f32(numerator, denominator);
390 float32x4_t res = vmlaq_f32(beta_vec, x_bar, gamma_vec);
391
392 // Perform fused activation
393 if(fused_activation)
394 {
395 activation_functor(res);
396 }
397
398 // Store results
399 vst1q_f32(reinterpret_cast<float *>(output.ptr()), res);
400 },
401 input, output);
402}
403
Georgios Pinitas57c033b2018-02-15 12:29:44 +0000404void NEBatchNormalizationLayerKernel::configure_non_fused()
405{
Michele Di Giorgio0cbb9272018-03-01 16:56:48 +0000406 const bool is_nhwc = _input->info()->data_layout() == DataLayout::NHWC;
Georgios Pinitas57c033b2018-02-15 12:29:44 +0000407 switch(_input->info()->data_type())
408 {
409 case DataType::QS8:
410 _func = &NEBatchNormalizationLayerKernel::batch_normalization_qs8<false>;
411 break;
412 case DataType::QS16:
413 _func = &NEBatchNormalizationLayerKernel::batch_normalization_qs16<false>;
414 break;
415 case DataType::F16:
Michele Di Giorgio0cbb9272018-03-01 16:56:48 +0000416 _func = (is_nhwc) ? &NEBatchNormalizationLayerKernel::batch_normalization_fp16_nhwc<false> : &NEBatchNormalizationLayerKernel::batch_normalization_fp16_nchw<false>;
Georgios Pinitas57c033b2018-02-15 12:29:44 +0000417 break;
418 case DataType::F32:
Michele Di Giorgio0cbb9272018-03-01 16:56:48 +0000419 _func = (is_nhwc) ? &NEBatchNormalizationLayerKernel::batch_normalization_fp32_nhwc<false, ::detail::dummy<float, 4>> :
420 &NEBatchNormalizationLayerKernel::batch_normalization_fp32_nchw<false, ::detail::dummy<float, 4>>;
Georgios Pinitas57c033b2018-02-15 12:29:44 +0000421 break;
422 default:
423 ARM_COMPUTE_ERROR("Element size not supported");
424 break;
425 }
426}
427
428void NEBatchNormalizationLayerKernel::configure_fused()
429{
Michele Di Giorgio0cbb9272018-03-01 16:56:48 +0000430 // NCHW Fused Batched Normalization with activation functions : FP32
431 static std::map<ActivationLayerInfo::ActivationFunction, BatchNormFunctionPtr> bn_fused_map_f32_nchw =
Georgios Pinitas57c033b2018-02-15 12:29:44 +0000432 {
Michele Di Giorgio0cbb9272018-03-01 16:56:48 +0000433 { ActivationLayerInfo::ActivationFunction::RELU, &NEBatchNormalizationLayerKernel::batch_normalization_fp32_nchw<true, ::detail::relu<float, 4>> },
434 { ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, &NEBatchNormalizationLayerKernel::batch_normalization_fp32_nchw<true, ::detail::brelu<float, 4>> },
435 { ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, &NEBatchNormalizationLayerKernel::batch_normalization_fp32_nchw<true, ::detail::lubrelu<float, 4>> }
436 };
437 // NHWC Fused Batched Normalization with activation functions : FP32
438 static std::map<ActivationLayerInfo::ActivationFunction, BatchNormFunctionPtr> bn_fused_map_f32_nhwc =
439 {
440 { ActivationLayerInfo::ActivationFunction::RELU, &NEBatchNormalizationLayerKernel::batch_normalization_fp32_nhwc<true, ::detail::relu<float, 4>> },
441 { ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, &NEBatchNormalizationLayerKernel::batch_normalization_fp32_nhwc<true, ::detail::brelu<float, 4>> },
442 { ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, &NEBatchNormalizationLayerKernel::batch_normalization_fp32_nhwc<true, ::detail::lubrelu<float, 4>> }
Georgios Pinitas57c033b2018-02-15 12:29:44 +0000443 };
444
445 switch(_input->info()->data_type())
446 {
447 case DataType::F32:
Michele Di Giorgio0cbb9272018-03-01 16:56:48 +0000448 _func = (_input->info()->data_layout() == DataLayout::NHWC) ? bn_fused_map_f32_nhwc[_act_info.activation()] : bn_fused_map_f32_nchw[_act_info.activation()];
Georgios Pinitas57c033b2018-02-15 12:29:44 +0000449 break;
450 default:
451 ARM_COMPUTE_ERROR("Element size not supported");
452 break;
453 }
454}
Ioan-Cristian Szabo303be902017-11-27 16:31:10 +0000455
456NEBatchNormalizationLayerKernel::NEBatchNormalizationLayerKernel()
Georgios Pinitas57c033b2018-02-15 12:29:44 +0000457 : _func(nullptr), _input(nullptr), _output(nullptr), _mean(nullptr), _var(nullptr), _gamma(nullptr), _beta(nullptr), _epsilon(), _act_info()
Ioan-Cristian Szabo303be902017-11-27 16:31:10 +0000458{
459}
Pablo Tello8fda1cb2017-07-05 15:20:38 +0100460
Georgios Pinitas57c033b2018-02-15 12:29:44 +0000461void NEBatchNormalizationLayerKernel::configure(ITensor *input, ITensor *output,
462 const ITensor *mean, const ITensor *var,
463 const ITensor *beta, const ITensor *gamma,
464 float epsilon, ActivationLayerInfo act_info)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100465{
Michele Di Giorgio4d336302018-03-02 09:43:54 +0000466 ARM_COMPUTE_ERROR_ON_NULLPTR(input, mean, var);
Ioan-Cristian Szabo303be902017-11-27 16:31:10 +0000467
Michele Di Giorgio4d336302018-03-02 09:43:54 +0000468 ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), (output != nullptr) ? output->info() : nullptr,
Georgios Pinitas57c033b2018-02-15 12:29:44 +0000469 mean->info(), var->info(),
Michele Di Giorgio4d336302018-03-02 09:43:54 +0000470 (beta != nullptr) ? beta->info() : nullptr,
471 (gamma != nullptr) ? gamma->info() : nullptr,
Georgios Pinitas57c033b2018-02-15 12:29:44 +0000472 epsilon, act_info));
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100473
Georgios Pinitas57c033b2018-02-15 12:29:44 +0000474 _input = input;
475 _output = input;
476 _mean = mean;
477 _var = var;
478 _gamma = gamma;
479 _beta = beta;
480 _epsilon = epsilon;
481 _act_info = act_info;
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100482
Michele Di Giorgio4d336302018-03-02 09:43:54 +0000483 const bool run_in_place = (output == nullptr) || (output == input);
484 if(!run_in_place)
Georgios Pinitas409ee0a2017-08-18 10:16:09 +0100485 {
Georgios Pinitas409ee0a2017-08-18 10:16:09 +0100486 _output = output;
487 }
488
Georgios Pinitas57c033b2018-02-15 12:29:44 +0000489 // Configure activation function to run
490 if(_act_info.enabled())
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100491 {
Georgios Pinitas57c033b2018-02-15 12:29:44 +0000492 configure_fused();
493 }
494 else
495 {
496 configure_non_fused();
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100497 }
498
Ioan-Cristian Szabo303be902017-11-27 16:31:10 +0000499 // Configure kernel window
Michele Di Giorgio4d336302018-03-02 09:43:54 +0000500 auto win_config = validate_and_configure_window(input->info(), (run_in_place) ? nullptr : output->info());
Ioan-Cristian Szabo303be902017-11-27 16:31:10 +0000501 ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
502 INEKernel::configure(win_config.second);
503}
504
Georgios Pinitas57c033b2018-02-15 12:29:44 +0000505Status NEBatchNormalizationLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output,
506 const ITensorInfo *mean, const ITensorInfo *var,
507 const ITensorInfo *beta, const ITensorInfo *gamma,
508 float epsilon, ActivationLayerInfo act_info)
Ioan-Cristian Szabo303be902017-11-27 16:31:10 +0000509{
Georgios Pinitas57c033b2018-02-15 12:29:44 +0000510 ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, mean, var, beta, gamma, epsilon, act_info));
Ioan-Cristian Szabo303be902017-11-27 16:31:10 +0000511 ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output ? output->clone().get() : nullptr).first);
512
Georgios Pinitas631c41a2017-12-06 11:53:03 +0000513 return Status{};
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100514}
515
Moritz Pflanzerc186b572017-09-07 09:48:04 +0100516void NEBatchNormalizationLayerKernel::run(const Window &window, const ThreadInfo &info)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100517{
Moritz Pflanzerc186b572017-09-07 09:48:04 +0100518 ARM_COMPUTE_UNUSED(info);
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100519 ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
520 ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
521 ARM_COMPUTE_ERROR_ON(_func == nullptr);
522
Georgios Pinitas57c033b2018-02-15 12:29:44 +0000523 (this->*_func)(window);
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100524}