blob: f9b3cd36203eadc6a16e3157c415e076c154b161 [file] [log] [blame]
Manuel Bottini79f88e62019-09-18 15:02:53 +01001/*
Viet-Hoa Do6829e022024-01-16 16:23:24 +00002 * Copyright (c) 2019-2021, 2024 Arm Limited.
Manuel Bottini79f88e62019-09-18 15:02:53 +01003 *
4 * SPDX-License-Identifier: MIT
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to
8 * deal in the Software without restriction, including without limitation the
9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10 * sell copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in all
14 * copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24#include "helpers.h"
25
Pablo Tello5c3eeec2021-04-26 15:39:05 +010026#if defined(VEC_SIZE) && defined(DATA_TYPE) && defined(INTERNAL_DATA_TYPE) & defined(DIM_X) && defined(DIM_Y) && defined(DIM_Z)
Pablo Marquez Tellofe7ae812021-03-03 12:12:35 +000027/** This function computes the mean and variance of each plane of the input tensor and provides it as output.
28 *
29 * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
30 * @attention Data type should be passed using the -DDATA_TYPE=data_type compile flag, e.g. -DDATA_TYPE=float
31 * @attention Dimensions X, Y, and Z should be given as a preprocessor argument with -DDIM_X=value, -DDIM_Y=value, -DDIM_Z=value. e.g. -DDIM_X=6, -DDIM_Y=2, -DDIM_Z=7
32 *
33 * @param[in] input_ptr Pointer to the first source tensor. Supported data types: F16/F32
34 * @param[in] input_stride_x Stride of the first source tensor in X dimension (in bytes)
35 * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
36 * @param[in] input_stride_y Stride of the first source tensor in Y dimension (in bytes)
37 * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
38 * @param[in] input_stride_z Stride of the first source tensor in Z dimension (in bytes)
39 * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
40 * @param[in] input_stride_w Stride of the source tensor in W dimension (in bytes)
41 * @param[in] input_step_w input_stride_w * number of elements along W processed per workitem(in bytes)
42 * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source tensor
43 * @param[out] output_ptr (Optional) Pointer to the destination tensor. Supported data types: same as @p input_ptr
44 * @param[in] output_stride_x (Optional) Stride of the destination tensor in X dimension (in bytes)
45 * @param[in] output_step_x (Optional) output_stride_x * number of elements along X processed per workitem(in bytes)
46 * @param[in] output_stride_y (Optional) Stride of the destination tensor in Y dimension (in bytes)
47 * @param[in] output_step_y (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes)
48 * @param[in] output_stride_z (Optional) Stride of the destination tensor in Z dimension (in bytes)
49 * @param[in] output_step_z (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes)
50 * @param[in] output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination tensor
51 */
52__kernel void compute_mean_var(
53 TENSOR4D_DECLARATION(input),
54 TENSOR3D_DECLARATION(output))
55{
Viet-Hoa Do6829e022024-01-16 16:23:24 +000056 Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input);
Pablo Marquez Tellofe7ae812021-03-03 12:12:35 +000057 Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(output);
58
59#if defined(NHWC)
Pablo Tello5c3eeec2021-04-26 15:39:05 +010060 const int ch = get_global_id(0); // Current channel
61 const int batch = get_global_id(1); // Current batch
62 const int elements_plane = DIM_Y * DIM_Z;
63 INTERNAL_DATA_TYPE part_sum = 0.f;
64 INTERNAL_DATA_TYPE part_sum_sq = 0.f;
65 const int in_offset = input_offset_first_element_in_bytes + batch * input_stride_w + ch * sizeof(DATA_TYPE);
66
67 for(int i_w = 0; i_w < DIM_Y; ++i_w)
Pablo Marquez Tellofe7ae812021-03-03 12:12:35 +000068 {
Pablo Tello5c3eeec2021-04-26 15:39:05 +010069 for(int i_h = 0; i_h < DIM_Z; ++i_h)
70 {
71 INTERNAL_DATA_TYPE data = (INTERNAL_DATA_TYPE) * ((__global DATA_TYPE *)tensor4D_offset(&in, ch, i_w, i_h, batch));
72 part_sum += data;
73 part_sum_sq += data * data;
74 }
Pablo Marquez Tellofe7ae812021-03-03 12:12:35 +000075 }
Pablo Tello5c3eeec2021-04-26 15:39:05 +010076
77 INTERNAL_DATA_TYPE mean = (part_sum / elements_plane);
78 INTERNAL_DATA_TYPE var = (part_sum_sq / elements_plane) - (mean * mean);
79 __global INTERNAL_DATA_TYPE *output_address0 = (__global INTERNAL_DATA_TYPE *)tensor3D_offset(&out, ch, 0, batch);
80 *output_address0 = mean;
81 __global INTERNAL_DATA_TYPE *output_address1 = (__global INTERNAL_DATA_TYPE *)tensor3D_offset(&out, ch, 1, batch);
82 *output_address1 = var;
Pablo Marquez Tellofe7ae812021-03-03 12:12:35 +000083#else // !defined(NHWC)
84 const int ch = get_global_id(2) % DIM_Z; // Current channel
85 const int batch = get_global_id(2) / DIM_Z; // Current batch
86 const int elements_plane = DIM_X * DIM_Y;
87
Pablo Tello5c3eeec2021-04-26 15:39:05 +010088 VEC_DATA_TYPE(INTERNAL_DATA_TYPE, VEC_SIZE)
Pablo Marquez Tellofe7ae812021-03-03 12:12:35 +000089 part_sum = 0.f;
Pablo Tello5c3eeec2021-04-26 15:39:05 +010090 VEC_DATA_TYPE(INTERNAL_DATA_TYPE, VEC_SIZE)
Pablo Marquez Tellofe7ae812021-03-03 12:12:35 +000091 part_sum_sq = 0.f;
92 // Calculate partial sum
93 for(int y = 0; y < DIM_Y; ++y)
94 {
95 int x = 0;
96 for(; x <= (DIM_X - VEC_SIZE); x += VEC_SIZE)
97 {
98 // Load data
Pablo Tello5c3eeec2021-04-26 15:39:05 +010099 VEC_DATA_TYPE(INTERNAL_DATA_TYPE, VEC_SIZE)
100 data = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)tensor4D_offset(&in, x, y, ch, batch)), VEC_DATA_TYPE(INTERNAL_DATA_TYPE, VEC_SIZE));
Pablo Marquez Tellofe7ae812021-03-03 12:12:35 +0000101 part_sum += data;
102 part_sum_sq += data * data;
103 }
104 // Left-overs loop
105 for(; x < DIM_X; ++x)
106 {
Pablo Tello5c3eeec2021-04-26 15:39:05 +0100107 INTERNAL_DATA_TYPE data = (INTERNAL_DATA_TYPE)(*((__global DATA_TYPE *)tensor4D_offset(&in, x, y, ch, batch)));
Pablo Marquez Tellofe7ae812021-03-03 12:12:35 +0000108 part_sum.s0 += data;
109 part_sum_sq.s0 += data * data;
110 }
111 }
112 // Perform reduction
113#if VEC_SIZE > 8
114 part_sum.s01234567 += part_sum.s89abcdef;
115 part_sum_sq.s01234567 += part_sum_sq.s89abcdef;
116#endif // VEC_SIZE > 8
117#if VEC_SIZE > 4
118 part_sum.s0123 += part_sum.s4567;
119 part_sum_sq.s0123 += part_sum_sq.s4567;
120#endif // VEC_SIZE > 4
121#if VEC_SIZE > 2
122 part_sum.s01 += part_sum.s23;
123 part_sum_sq.s01 += part_sum_sq.s23;
124#endif // VEC_SIZE > 2
125 part_sum.s0 += part_sum.s1;
126 part_sum_sq.s0 += part_sum_sq.s1;
127
Pablo Tello5c3eeec2021-04-26 15:39:05 +0100128 INTERNAL_DATA_TYPE sum = (INTERNAL_DATA_TYPE)part_sum.s0;
129 INTERNAL_DATA_TYPE sum_sq = (INTERNAL_DATA_TYPE)part_sum_sq.s0;
Pablo Marquez Tellofe7ae812021-03-03 12:12:35 +0000130
Pablo Tello5c3eeec2021-04-26 15:39:05 +0100131 const INTERNAL_DATA_TYPE mean = (sum / elements_plane);
132 const INTERNAL_DATA_TYPE var = (sum_sq / elements_plane) - (mean * mean);
Pablo Marquez Tellofe7ae812021-03-03 12:12:35 +0000133
Pablo Tello5c3eeec2021-04-26 15:39:05 +0100134 __global INTERNAL_DATA_TYPE *output_address0 = (__global INTERNAL_DATA_TYPE *)tensor3D_offset(&out, ch, 0, batch);
135 *output_address0 = mean;
136 __global INTERNAL_DATA_TYPE *output_address1 = (__global INTERNAL_DATA_TYPE *)tensor3D_offset(&out, ch, 1, batch);
137 *output_address1 = var;
Pablo Marquez Tellofe7ae812021-03-03 12:12:35 +0000138
139#endif // defined(NHWC)
140}
141#endif /* defined(VEC_SIZE) && defined(DATA_TYPE) && defined(DIM_X) && defined(DIM_Y) && defined(DIM_Z) */
142
Georgios Pinitas55a687d2020-01-30 12:00:23 +0000143#if defined(VEC_SIZE) && defined(DATA_TYPE) && defined(INTERNAL_DATA_TYPE) && defined(GAMMA) && defined(BETA) && defined(EPSILON) && defined(DIM_X) && defined(DIM_Y) && defined(DIM_Z)
Manuel Bottini79f88e62019-09-18 15:02:53 +0100144/** This function normalizes the input 2D tensor across the first dimension with respect to mean and standard deviation of the same dimension.
145 *
146 * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
147 * @attention Data type should be passed using the -DDATA_TYPE=data_type compile flag, e.g. -DDATA_TYPE=float
148 * @attention The scale scalar value applied to the normalized tensor should be passed using the -DGAMMA=value compile flag, e.g. -DGAMMA=1.3
149 * @attention The offset scalar value applied to the normalized tensor should be passed using the -DBETA=value compile flag, e.g. -DBETA=2.4
150 * @attention Normalization epsilon parameter should be given as a preprocessor argument with -DEPSILON=value. e.g. -DEPSILON=0.001f
151 * @attention Dimensions X, Y, and Z should be given as a preprocessor argument with -DDIM_X=value, -DDIM_Y=value, -DDIM_Z=value. e.g. -DDIM_X=6, -DDIM_Y=2, -DDIM_Z=7
152 *
153 * @param[in] input_ptr Pointer to the first source tensor. Supported data types: F16/F32
154 * @param[in] input_stride_x Stride of the first source tensor in X dimension (in bytes)
155 * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
156 * @param[in] input_stride_y Stride of the first source tensor in Y dimension (in bytes)
157 * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
158 * @param[in] input_stride_z Stride of the first source tensor in Z dimension (in bytes)
159 * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
160 * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source tensor
161 * @param[out] output_ptr (Optional) Pointer to the destination tensor. Supported data types: same as @p input_ptr
162 * @param[in] output_stride_x (Optional) Stride of the destination tensor in X dimension (in bytes)
163 * @param[in] output_step_x (Optional) output_stride_x * number of elements along X processed per workitem(in bytes)
164 * @param[in] output_stride_y (Optional) Stride of the destination tensor in Y dimension (in bytes)
165 * @param[in] output_step_y (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes)
166 * @param[in] output_stride_z (Optional) Stride of the destination tensor in Z dimension (in bytes)
167 * @param[in] output_step_z (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes)
168 * @param[in] output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination tensor
169 */
170__kernel void instance_normalization(
Pablo Marquez Tellofe7ae812021-03-03 12:12:35 +0000171 TENSOR4D_DECLARATION(input),
172 TENSOR3D_DECLARATION(mean_var)
Manuel Bottini79f88e62019-09-18 15:02:53 +0100173#ifndef IN_PLACE
174 ,
175 TENSOR4D_DECLARATION(output)
176#endif /* IN_PLACE */
177)
178{
Viet-Hoa Do6829e022024-01-16 16:23:24 +0000179 Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input);
Pablo Marquez Tellofe7ae812021-03-03 12:12:35 +0000180 Tensor3D mean_var = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(mean_var);
Michele Di Giorgioa0a3d202019-11-20 15:06:52 +0000181#ifndef IN_PLACE
Viet-Hoa Do6829e022024-01-16 16:23:24 +0000182 Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(output);
Michele Di Giorgioa0a3d202019-11-20 15:06:52 +0000183#endif /* IN_PLACE */
184
Pablo Marquez Tellofe7ae812021-03-03 12:12:35 +0000185#if defined(NHWC)
186 const int ch = get_global_id(0); // Current channel
187 const int batch = get_global_id(2); // Current batch
188#else /* defined(NHWC) */
189 const int ch = get_global_id(2) % DIM_Z; // Current channel
190 const int batch = get_global_id(2) / DIM_Z; // Current batch
191#endif /* defined(NHWC) */
192
Pablo Tello5c3eeec2021-04-26 15:39:05 +0100193 const __global INTERNAL_DATA_TYPE *mean_ptr = (__global INTERNAL_DATA_TYPE *)tensor3D_offset(&mean_var, ch, 0, batch);
194 const __global INTERNAL_DATA_TYPE *var_ptr = (__global INTERNAL_DATA_TYPE *)tensor3D_offset(&mean_var, ch, 1, batch);
195 const INTERNAL_DATA_TYPE mean = (INTERNAL_DATA_TYPE) * mean_ptr;
196 const INTERNAL_DATA_TYPE var = (INTERNAL_DATA_TYPE) * var_ptr;
197 const INTERNAL_DATA_TYPE multip = GAMMA / sqrt(var + EPSILON);
198 const INTERNAL_DATA_TYPE beta = (INTERNAL_DATA_TYPE)BETA;
Manuel Bottini79f88e62019-09-18 15:02:53 +0100199
200#if defined(NHWC)
Pablo Marquez Tellofe7ae812021-03-03 12:12:35 +0000201 const int in_offset = input_offset_first_element_in_bytes + batch * input_stride_w + ch * sizeof(DATA_TYPE);
202#ifndef IN_PLACE
203 const int out_offset = output_offset_first_element_in_bytes + batch * input_stride_w + ch * sizeof(DATA_TYPE);
Manuel Bottini79f88e62019-09-18 15:02:53 +0100204#endif /* IN_PLACE */
Pablo Marquez Tellofe7ae812021-03-03 12:12:35 +0000205
Pablo Tello5c3eeec2021-04-26 15:39:05 +0100206 for(int i_w = 0; i_w < DIM_Y; ++i_w)
Pablo Marquez Tellofe7ae812021-03-03 12:12:35 +0000207 {
Pablo Tello5c3eeec2021-04-26 15:39:05 +0100208 for(int i_h = 0; i_h < DIM_Z; ++i_h)
209 {
210 __global DATA_TYPE *input_address = (__global DATA_TYPE *)tensor4D_offset(&in, ch, i_w, i_h, batch);
Pablo Marquez Tellofe7ae812021-03-03 12:12:35 +0000211#ifdef IN_PLACE
Pablo Tello5c3eeec2021-04-26 15:39:05 +0100212 __global DATA_TYPE *output_address = input_address;
Pablo Marquez Tellofe7ae812021-03-03 12:12:35 +0000213#else /* !IN_PLACE */
Pablo Tello5c3eeec2021-04-26 15:39:05 +0100214 __global DATA_TYPE *output_address = (__global DATA_TYPE *)tensor4D_offset(&out, ch, i_w, i_h, batch);
Pablo Marquez Tellofe7ae812021-03-03 12:12:35 +0000215#endif /* IN_PLACE */
Pablo Tello5c3eeec2021-04-26 15:39:05 +0100216 *(output_address) = (*(input_address) - mean) * multip + (INTERNAL_DATA_TYPE)BETA;
217 }
Manuel Bottini79f88e62019-09-18 15:02:53 +0100218 }
Manuel Bottini79f88e62019-09-18 15:02:53 +0100219#else // !defined(NHWC)
Michele Di Giorgioa0a3d202019-11-20 15:06:52 +0000220 for(int y = 0; y < DIM_Y; ++y)
Manuel Bottini79f88e62019-09-18 15:02:53 +0100221 {
Michele Di Giorgioa0a3d202019-11-20 15:06:52 +0000222 int x = 0;
223 for(; x <= (DIM_X - VEC_SIZE); x += VEC_SIZE)
224 {
225 __global DATA_TYPE *input_address = (__global DATA_TYPE *)tensor4D_offset(&in, x, y, ch, batch);
Manuel Bottini79f88e62019-09-18 15:02:53 +0100226#ifdef IN_PLACE
Michele Di Giorgioa0a3d202019-11-20 15:06:52 +0000227 __global DATA_TYPE *output_address = input_address;
Manuel Bottini79f88e62019-09-18 15:02:53 +0100228#else /* !IN_PLACE */
Michele Di Giorgioa0a3d202019-11-20 15:06:52 +0000229 __global DATA_TYPE *output_address = (__global DATA_TYPE *)tensor4D_offset(&out, x, y, ch, batch);
Manuel Bottini79f88e62019-09-18 15:02:53 +0100230#endif /* IN_PLACE */
231
Georgios Pinitas55a687d2020-01-30 12:00:23 +0000232 VEC_DATA_TYPE(INTERNAL_DATA_TYPE, VEC_SIZE)
233 data = CONVERT(VLOAD(VEC_SIZE)(0, input_address), VEC_DATA_TYPE(INTERNAL_DATA_TYPE, VEC_SIZE));
Manuel Bottini79f88e62019-09-18 15:02:53 +0100234
Georgios Pinitas55a687d2020-01-30 12:00:23 +0000235 VEC_DATA_TYPE(INTERNAL_DATA_TYPE, VEC_SIZE)
236 res = (data - mean) * multip + (INTERNAL_DATA_TYPE)BETA;
Michele Di Giorgioa0a3d202019-11-20 15:06:52 +0000237 VSTORE(VEC_SIZE)
Georgios Pinitas55a687d2020-01-30 12:00:23 +0000238 (CONVERT(res, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)), 0, output_address);
Michele Di Giorgioa0a3d202019-11-20 15:06:52 +0000239 }
240 // Left-overs loop
241 for(; x < DIM_X; ++x)
242 {
243 __global DATA_TYPE *input_address = (__global DATA_TYPE *)tensor4D_offset(&in, x, y, ch, batch);
Manuel Bottini79f88e62019-09-18 15:02:53 +0100244#ifdef IN_PLACE
Michele Di Giorgioa0a3d202019-11-20 15:06:52 +0000245 __global DATA_TYPE *output_address = input_address;
Manuel Bottini79f88e62019-09-18 15:02:53 +0100246#else /* !IN_PLACE */
Michele Di Giorgioa0a3d202019-11-20 15:06:52 +0000247 __global DATA_TYPE *output_address = (__global DATA_TYPE *)tensor4D_offset(&out, x, y, ch, batch);
Manuel Bottini79f88e62019-09-18 15:02:53 +0100248#endif /* IN_PLACE */
Georgios Pinitas55a687d2020-01-30 12:00:23 +0000249 *(output_address) = (*(input_address) - mean) * multip + (INTERNAL_DATA_TYPE)BETA;
Michele Di Giorgioa0a3d202019-11-20 15:06:52 +0000250 }
Manuel Bottini79f88e62019-09-18 15:02:53 +0100251 }
252#endif // defined(NHWC)
253}
Georgios Pinitas55a687d2020-01-30 12:00:23 +0000254#endif /* defined(VEC_SIZE) && defined(DATA_TYPE) && defined(INTERNAL_DATA_TYPE) && defined(GAMMA) && defined(BETA) && defined(EPSILON) && defined(DIM_X) && defined(DIM_Y) && defined(DIM_Z) */