blob: f5144c6bf39138099d6abf4d64ae054d75439a5d [file] [log] [blame]
/*
 * Copyright (c) 2017 ARM Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
24#include "arm_compute/core/NEON/kernels/NEBatchNormalizationLayerKernel.h"
25
26#include "arm_compute/core/Helpers.h"
27#include "arm_compute/core/NEON/NEFixedPoint.h"
28#include "arm_compute/core/NEON/NEMath.h"
29#include "arm_compute/core/TensorInfo.h"
30#include "arm_compute/core/Utils.h"
31#include "arm_compute/core/Validate.h"
32#include "arm_compute/core/Window.h"
33
34using namespace arm_compute;
35
Ioan-Cristian Szabo303be902017-11-27 16:31:10 +000036namespace
Anthony Barbier6ff3b192017-09-04 18:44:23 +010037{
Georgios Pinitas631c41a2017-12-06 11:53:03 +000038Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *mean, const ITensorInfo *var, const ITensorInfo *beta, const ITensorInfo *gamma, float epsilon)
Ioan-Cristian Szabo303be902017-11-27 16:31:10 +000039{
40 ARM_COMPUTE_UNUSED(epsilon);
41 ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
42
43 if(nullptr != output)
44 {
45 ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
46 ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
47 ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
48 }
49
50 ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, mean, var, beta, gamma);
51 ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, mean, var, beta, gamma);
52 ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(mean, var, beta, gamma);
53 ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(2) != mean->dimension(0));
54
Georgios Pinitas631c41a2017-12-06 11:53:03 +000055 return Status{};
Ioan-Cristian Szabo303be902017-11-27 16:31:10 +000056}
57
Georgios Pinitas631c41a2017-12-06 11:53:03 +000058std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output)
Ioan-Cristian Szabo303be902017-11-27 16:31:10 +000059{
60 unsigned int num_elems_processed_per_iteration = 16 / input->element_size();
61
62 Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
63 AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration);
64 AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
65 bool window_changed = update_window_and_padding(win, input_access, output_access);
66 output_access.set_valid_region(win, input->valid_region());
Georgios Pinitas631c41a2017-12-06 11:53:03 +000067 Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
Ioan-Cristian Szabo303be902017-11-27 16:31:10 +000068 return std::make_pair(err, win);
Anthony Barbier6ff3b192017-09-04 18:44:23 +010069}
70
Georgios Pinitas409ee0a2017-08-18 10:16:09 +010071void batch_normalization_q8(ITensor *in, ITensor *out, const ITensor *mean, const ITensor *var, const ITensor *beta, const ITensor *gamma, float epsilon, const Window &window)
Anthony Barbier6ff3b192017-09-04 18:44:23 +010072{
73 Iterator input(in, window);
74 Iterator output(out, window);
75
76 // Hold information about the current feature map we are iterating.
77 // Only compute denominator and NEON vectors once per feature map.
78 int slice = -1;
79
Michalis Spyroubbd3d602017-06-21 17:29:40 +010080 const int fixed_point_position = in->info()->fixed_point_position();
Anthony Barbier6ff3b192017-09-04 18:44:23 +010081 const auto input_mean = reinterpret_cast<const qint8_t *>(mean->ptr_to_element(Coordinates(0, 0)));
82 const auto input_var = reinterpret_cast<const qint8_t *>(var->ptr_to_element(Coordinates(0, 0)));
83 const auto input_gamma = reinterpret_cast<const qint8_t *>(gamma->ptr_to_element(Coordinates(0, 0)));
84 const auto input_beta = reinterpret_cast<const qint8_t *>(beta->ptr_to_element(Coordinates(0, 0)));
85
86 qint8x16_t mean_vec = vdupq_n_qs8(0);
87 qint8x16_t var_vec = vdupq_n_qs8(0);
88 qint8x16_t gamma_vec = vdupq_n_qs8(0);
89 qint8x16_t beta_vec = vdupq_n_qs8(0);
90 qint8x16_t denominator = vdupq_n_qs8(0);
Georgios Pinitas21efeb42017-07-04 12:47:17 +010091 const qint8x16_t epsilon_vec = vdupq_n_qs8(sqcvt_qs8_f32(epsilon, fixed_point_position));
Anthony Barbier6ff3b192017-09-04 18:44:23 +010092 execute_window_loop(window, [&](const Coordinates & id)
93 {
94 if(slice != id.z())
95 {
96 // Conctruct vectors
97 mean_vec = vdupq_n_qs8(*(input_mean + id.z()));
98 var_vec = vdupq_n_qs8(*(input_var + id.z()));
99 gamma_vec = vdupq_n_qs8(*(input_gamma + id.z()));
100 beta_vec = vdupq_n_qs8(*(input_beta + id.z()));
101
102 // Calculate denominator
103 denominator = vqinvsqrtq_qs8(vqaddq_qs8(var_vec, epsilon_vec), fixed_point_position);
104 slice = id.z();
105 }
106
107 // Calculate x bar and store results
108 const qint8x16_t numerator = vqsubq_qs8(vld1q_qs8(reinterpret_cast<const qint8_t *>(input.ptr())), mean_vec);
109 const qint8x16_t x_bar = vqmulq_qs8(numerator, denominator, fixed_point_position);
110 vst1q_qs8(reinterpret_cast<qint8_t *>(output.ptr()), vqmlaq_qs8(beta_vec, x_bar, gamma_vec, fixed_point_position));
111 },
112 input, output);
113}
114
Georgios Pinitas409ee0a2017-08-18 10:16:09 +0100115void batch_normalization_q16(ITensor *in, ITensor *out, const ITensor *mean, const ITensor *var, const ITensor *beta, const ITensor *gamma, float epsilon, const Window &window)
Michalis Spyroubbd3d602017-06-21 17:29:40 +0100116{
117 Iterator input(in, window);
118 Iterator output(out, window);
119
120 // Hold information about the current feature map we are iterating.
121 // Only compute denominator and NEON vectors once per feature map.
122 int slice = -1;
123
124 const int fixed_point_position = in->info()->fixed_point_position();
125 const auto input_mean = reinterpret_cast<const qint16_t *>(mean->ptr_to_element(Coordinates(0, 0)));
126 const auto input_var = reinterpret_cast<const qint16_t *>(var->ptr_to_element(Coordinates(0, 0)));
127 const auto input_gamma = reinterpret_cast<const qint16_t *>(gamma->ptr_to_element(Coordinates(0, 0)));
128 const auto input_beta = reinterpret_cast<const qint16_t *>(beta->ptr_to_element(Coordinates(0, 0)));
129
130 qint16x8_t mean_vec = vdupq_n_qs16(0);
131 qint16x8_t var_vec = vdupq_n_qs16(0);
132 qint16x8_t gamma_vec = vdupq_n_qs16(0);
133 qint16x8_t beta_vec = vdupq_n_qs16(0);
134 qint16x8_t denominator = vdupq_n_qs16(0);
135 const qint16x8_t epsilon_vec = vdupq_n_qs16(sqcvt_qs16_f32(epsilon, fixed_point_position));
136 execute_window_loop(window, [&](const Coordinates & id)
137 {
138 if(slice != id.z())
139 {
140 // Conctruct vectors
141 mean_vec = vdupq_n_qs16(*(input_mean + id.z()));
142 var_vec = vdupq_n_qs16(*(input_var + id.z()));
143 gamma_vec = vdupq_n_qs16(*(input_gamma + id.z()));
144 beta_vec = vdupq_n_qs16(*(input_beta + id.z()));
145
146 // Calculate denominator
147 denominator = vqinvsqrtq_qs16(vqaddq_qs16(var_vec, epsilon_vec), fixed_point_position);
148 slice = id.z();
149 }
150
151 // Calculate x bar and store results
152 const qint16x8_t numerator = vqsubq_qs16(vld1q_qs16(reinterpret_cast<const qint16_t *>(input.ptr())), mean_vec);
153 const qint16x8_t x_bar = vqmulq_qs16(numerator, denominator, fixed_point_position);
154 vst1q_qs16(reinterpret_cast<qint16_t *>(output.ptr()), vqmlaq_qs16(beta_vec, x_bar, gamma_vec, fixed_point_position));
155 },
156 input, output);
157}
158
Georgios Pinitas409ee0a2017-08-18 10:16:09 +0100159void batch_normalization_fp32(ITensor *in, ITensor *out, const ITensor *mean, const ITensor *var, const ITensor *beta, const ITensor *gamma, float epsilon, const Window &window)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100160{
161 Iterator input(in, window);
162 Iterator output(out, window);
163
164 // Hold information about the current feature map we are iterating.
165 // Only compute denominator and NEON vectors once per feature map.
166 int slice = -1;
167
168 const auto input_mean = reinterpret_cast<const float *>(mean->ptr_to_element(Coordinates(0, 0)));
169 const auto input_var = reinterpret_cast<const float *>(var->ptr_to_element(Coordinates(0, 0)));
170 const auto input_gamma = reinterpret_cast<const float *>(gamma->ptr_to_element(Coordinates(0, 0)));
171 const auto input_beta = reinterpret_cast<const float *>(beta->ptr_to_element(Coordinates(0, 0)));
172
173 float32x4_t mean_vec = vdupq_n_f32(0.0);
174 float32x4_t var_vec = vdupq_n_f32(0.0);
175 float32x4_t gamma_vec = vdupq_n_f32(0.0);
176 float32x4_t beta_vec = vdupq_n_f32(0.0);
177 float32x4_t denominator = vdupq_n_f32(0.0);
178 const float32x4_t epsilon_vec = vdupq_n_f32(epsilon);
179 execute_window_loop(window, [&](const Coordinates & id)
180 {
181 if(slice != id.z())
182 {
183 // Conctruct vectors
184 mean_vec = vdupq_n_f32(*(input_mean + id.z()));
185 var_vec = vdupq_n_f32(*(input_var + id.z()));
186 gamma_vec = vdupq_n_f32(*(input_gamma + id.z()));
187 beta_vec = vdupq_n_f32(*(input_beta + id.z()));
188
189 // Calculate denominator
190 denominator = vinvsqrtq_f32(vaddq_f32(var_vec, epsilon_vec));
191 slice = id.z();
192 }
193
194 // Calculate x bar and store results
195 const float32x4_t numerator = vsubq_f32(vld1q_f32(reinterpret_cast<const float *>(input.ptr())), mean_vec);
196 const float32x4_t x_bar = vmulq_f32(numerator, denominator);
197 vst1q_f32(reinterpret_cast<float *>(output.ptr()), vmlaq_f32(beta_vec, x_bar, gamma_vec));
198 },
199 input, output);
200}
201
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
/** Batch normalization for F16 data (only built when FP16 vector arithmetic is available).
 *
 * For every 8-lane vector x of the input it stores
 * beta + gamma * ((x - mean) * 1/sqrt(var + epsilon)),
 * where mean, var, beta and gamma are read at the current Z coordinate.
 *
 * @param[in]  in      Input tensor.
 * @param[out] out     Output tensor.
 * @param[in]  mean    Mean values, indexed by Z.
 * @param[in]  var     Variance values, indexed by Z.
 * @param[in]  beta    Beta values, indexed by Z.
 * @param[in]  gamma   Gamma values, indexed by Z.
 * @param[in]  epsilon Value added to the variance before the inverse square root.
 * @param[in]  window  Region to iterate over.
 */
void batch_normalization_fp16(ITensor *in, ITensor *out, const ITensor *mean, const ITensor *var, const ITensor *beta, const ITensor *gamma, float epsilon, const Window &window)
{
    Iterator src_it(in, window);
    Iterator dst_it(out, window);

    const Coordinates origin(0, 0);
    const auto        mean_ptr  = reinterpret_cast<const float16_t *>(mean->ptr_to_element(origin));
    const auto        var_ptr   = reinterpret_cast<const float16_t *>(var->ptr_to_element(origin));
    const auto        gamma_ptr = reinterpret_cast<const float16_t *>(gamma->ptr_to_element(origin));
    const auto        beta_ptr  = reinterpret_cast<const float16_t *>(beta->ptr_to_element(origin));

    const float16x8_t eps_v = vdupq_n_f16(epsilon);

    // Per-slice state: refreshed only when the Z coordinate changes
    int         cached_z = -1;
    float16x8_t mean_v   = vdupq_n_f16(0.0);
    float16x8_t gamma_v  = vdupq_n_f16(0.0);
    float16x8_t beta_v   = vdupq_n_f16(0.0);
    float16x8_t inv_std  = vdupq_n_f16(0.0);

    execute_window_loop(window, [&](const Coordinates & id)
    {
        const int z = id.z();
        if(z != cached_z)
        {
            // Broadcast the parameters of the new slice
            mean_v                  = vdupq_n_f16(mean_ptr[z]);
            const float16x8_t var_v = vdupq_n_f16(var_ptr[z]);
            gamma_v                 = vdupq_n_f16(gamma_ptr[z]);
            beta_v                  = vdupq_n_f16(beta_ptr[z]);

            // 1 / sqrt(var + epsilon)
            inv_std  = vinvsqrtq_f16(vaddq_f16(var_v, eps_v));
            cached_z = z;
        }

        // Normalize, then scale and shift (separate multiply and add)
        const float16x8_t values     = vld1q_f16(reinterpret_cast<const float16_t *>(src_it.ptr()));
        const float16x8_t centred    = vsubq_f16(values, mean_v);
        const float16x8_t normalized = vmulq_f16(centred, inv_std);
        const float16x8_t scaled     = vmulq_f16(normalized, gamma_v);
        vst1q_f16(reinterpret_cast<float16_t *>(dst_it.ptr()), vaddq_f16(beta_v, scaled));
    },
    src_it, dst_it);
}
#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
Ioan-Cristian Szabo303be902017-11-27 16:31:10 +0000246} // namespace
247
248NEBatchNormalizationLayerKernel::NEBatchNormalizationLayerKernel()
249 : _func(nullptr), _input(nullptr), _output(nullptr), _mean(nullptr), _var(nullptr), _gamma(nullptr), _beta(nullptr), _epsilon()
250{
251}
Pablo Tello8fda1cb2017-07-05 15:20:38 +0100252
Georgios Pinitas409ee0a2017-08-18 10:16:09 +0100253void NEBatchNormalizationLayerKernel::configure(ITensor *input, ITensor *output, const ITensor *mean, const ITensor *var, const ITensor *beta, const ITensor *gamma, float epsilon)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100254{
Ioan-Cristian Szabo303be902017-11-27 16:31:10 +0000255 ARM_COMPUTE_ERROR_ON_NULLPTR(input, mean, var, beta, gamma);
256
257 ITensorInfo *output_info = nullptr;
258
259 if(nullptr != output)
260 {
261 // Output tensor auto initialization if not yet initialized
262 auto_init_if_empty(*output->info(), *input->info());
263
264 output_info = output->info();
265 }
266
267 ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output_info, mean->info(), var->info(), beta->info(), gamma->info(), epsilon));
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100268
269 _input = input;
Georgios Pinitas409ee0a2017-08-18 10:16:09 +0100270 _output = input;
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100271 _mean = mean;
272 _var = var;
273 _gamma = gamma;
274 _beta = beta;
275 _epsilon = epsilon;
276
Georgios Pinitas409ee0a2017-08-18 10:16:09 +0100277 if(output != nullptr)
278 {
Georgios Pinitas409ee0a2017-08-18 10:16:09 +0100279 _output = output;
280 }
281
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100282 switch(input->info()->data_type())
283 {
284 case DataType::QS8:
Ioan-Cristian Szabo303be902017-11-27 16:31:10 +0000285 _func = &batch_normalization_q8;
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100286 break;
Michalis Spyroubbd3d602017-06-21 17:29:40 +0100287 case DataType::QS16:
Ioan-Cristian Szabo303be902017-11-27 16:31:10 +0000288 _func = &batch_normalization_q16;
Michalis Spyroubbd3d602017-06-21 17:29:40 +0100289 break;
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100290 case DataType::F32:
Ioan-Cristian Szabo303be902017-11-27 16:31:10 +0000291 _func = &batch_normalization_fp32;
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100292 break;
Pablo Tello8fda1cb2017-07-05 15:20:38 +0100293 case DataType::F16:
Ioan-Cristian Szabo5edbd1c2017-11-13 13:34:08 +0000294#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
Ioan-Cristian Szabo303be902017-11-27 16:31:10 +0000295 _func = &batch_normalization_fp16;
Pablo Tello8fda1cb2017-07-05 15:20:38 +0100296 break;
Ioan-Cristian Szabo5edbd1c2017-11-13 13:34:08 +0000297#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100298 default:
299 ARM_COMPUTE_ERROR("Element size not supported");
300 break;
301 }
302
Ioan-Cristian Szabo303be902017-11-27 16:31:10 +0000303 // Configure kernel window
304 auto win_config = validate_and_configure_window(input->info(), output_info);
305 ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
306 INEKernel::configure(win_config.second);
307}
308
Georgios Pinitas631c41a2017-12-06 11:53:03 +0000309Status NEBatchNormalizationLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *mean, const ITensorInfo *var, const ITensorInfo *beta,
310 const ITensorInfo *gamma,
311 float epsilon)
Ioan-Cristian Szabo303be902017-11-27 16:31:10 +0000312{
313 ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, mean, var, beta, gamma, epsilon));
314 ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output ? output->clone().get() : nullptr).first);
315
Georgios Pinitas631c41a2017-12-06 11:53:03 +0000316 return Status{};
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100317}
318
Moritz Pflanzerc186b572017-09-07 09:48:04 +0100319void NEBatchNormalizationLayerKernel::run(const Window &window, const ThreadInfo &info)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100320{
Moritz Pflanzerc186b572017-09-07 09:48:04 +0100321 ARM_COMPUTE_UNUSED(info);
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100322 ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
323 ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
324 ARM_COMPUTE_ERROR_ON(_func == nullptr);
325
326 (*_func)(_input, _output, _mean, _var, _beta, _gamma, _epsilon, window);
327}