blob: d2507d94dd93218c69c5ef29c519a5e309e746ca [file] [log] [blame]
Manuel Bottini79f88e62019-09-18 15:02:53 +01001/*
Pablo Marquez Tellofe7ae812021-03-03 12:12:35 +00002 * Copyright (c) 2019-2021 Arm Limited.
Manuel Bottini79f88e62019-09-18 15:02:53 +01003 *
4 * SPDX-License-Identifier: MIT
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to
8 * deal in the Software without restriction, including without limitation the
9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10 * sell copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in all
14 * copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24#include "helpers.h"
25
Pablo Marquez Tellofe7ae812021-03-03 12:12:35 +000026#if defined(VEC_SIZE) && defined(DATA_TYPE) && defined(DIM_X) && defined(DIM_Y) && defined(DIM_Z)
27/** This function computes the mean and variance of each plane of the input tensor and provides it as output.
28 *
29 * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
30 * @attention Data type should be passed using the -DDATA_TYPE=data_type compile flag, e.g. -DDATA_TYPE=float
31 * @attention Dimensions X, Y, and Z should be given as a preprocessor argument with -DDIM_X=value, -DDIM_Y=value, -DDIM_Z=value. e.g. -DDIM_X=6, -DDIM_Y=2, -DDIM_Z=7
32 *
33 * @param[in] input_ptr Pointer to the first source tensor. Supported data types: F16/F32
34 * @param[in] input_stride_x Stride of the first source tensor in X dimension (in bytes)
35 * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
36 * @param[in] input_stride_y Stride of the first source tensor in Y dimension (in bytes)
37 * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
38 * @param[in] input_stride_z Stride of the first source tensor in Z dimension (in bytes)
39 * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
40 * @param[in] input_stride_w Stride of the source tensor in W dimension (in bytes)
41 * @param[in] input_step_w input_stride_w * number of elements along W processed per workitem(in bytes)
42 * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source tensor
43 * @param[out] output_ptr (Optional) Pointer to the destination tensor. Supported data types: same as @p input_ptr
44 * @param[in] output_stride_x (Optional) Stride of the destination tensor in X dimension (in bytes)
45 * @param[in] output_step_x (Optional) output_stride_x * number of elements along X processed per workitem(in bytes)
46 * @param[in] output_stride_y (Optional) Stride of the destination tensor in Y dimension (in bytes)
47 * @param[in] output_step_y (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes)
48 * @param[in] output_stride_z (Optional) Stride of the destination tensor in Z dimension (in bytes)
49 * @param[in] output_step_z (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes)
50 * @param[in] output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination tensor
51 */
52__kernel void compute_mean_var(
53 TENSOR4D_DECLARATION(input),
54 TENSOR3D_DECLARATION(output))
55{
56 Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, 0);
57 Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(output);
58
59#if defined(NHWC)
60 const int ch = get_global_id(0); // Current channel
61 const int batch = get_global_id(1); // Current batch
62 const int elements_plane = DIM_Y * DIM_Z;
63 float part_sum = 0.f;
64 float part_sum_sq = 0.f;
65 const int in_offset = input_offset_first_element_in_bytes + batch * input_stride_w + ch * sizeof(DATA_TYPE);
66 for(int i = 0; i < (DIM_Y * DIM_Z); ++i)
67 {
68 const float data = *((__global DATA_TYPE *)(input_ptr + in_offset + i * input_stride_y));
69 part_sum += data;
70 part_sum_sq += data * data;
71 }
72 float mean = (part_sum / elements_plane);
73 float var = (part_sum_sq / elements_plane) - (mean * mean);
74 __global DATA_TYPE *output_address0 = (__global DATA_TYPE *)tensor3D_offset(&out, ch, 0, batch);
75 *output_address0 = mean;
76 __global DATA_TYPE *output_address1 = (__global DATA_TYPE *)tensor3D_offset(&out, ch, 1, batch);
77 *output_address1 = var;
78#else // !defined(NHWC)
79 const int ch = get_global_id(2) % DIM_Z; // Current channel
80 const int batch = get_global_id(2) / DIM_Z; // Current batch
81 const int elements_plane = DIM_X * DIM_Y;
82
83 VEC_DATA_TYPE(float, VEC_SIZE)
84 part_sum = 0.f;
85 VEC_DATA_TYPE(float, VEC_SIZE)
86 part_sum_sq = 0.f;
87 // Calculate partial sum
88 for(int y = 0; y < DIM_Y; ++y)
89 {
90 int x = 0;
91 for(; x <= (DIM_X - VEC_SIZE); x += VEC_SIZE)
92 {
93 // Load data
94 VEC_DATA_TYPE(float, VEC_SIZE)
95 data = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)tensor4D_offset(&in, x, y, ch, batch)), VEC_DATA_TYPE(float, VEC_SIZE));
96 part_sum += data;
97 part_sum_sq += data * data;
98 }
99 // Left-overs loop
100 for(; x < DIM_X; ++x)
101 {
102 float data = (float)(*((__global DATA_TYPE *)tensor4D_offset(&in, x, y, ch, batch)));
103 part_sum.s0 += data;
104 part_sum_sq.s0 += data * data;
105 }
106 }
107 // Perform reduction
108#if VEC_SIZE > 8
109 part_sum.s01234567 += part_sum.s89abcdef;
110 part_sum_sq.s01234567 += part_sum_sq.s89abcdef;
111#endif // VEC_SIZE > 8
112#if VEC_SIZE > 4
113 part_sum.s0123 += part_sum.s4567;
114 part_sum_sq.s0123 += part_sum_sq.s4567;
115#endif // VEC_SIZE > 4
116#if VEC_SIZE > 2
117 part_sum.s01 += part_sum.s23;
118 part_sum_sq.s01 += part_sum_sq.s23;
119#endif // VEC_SIZE > 2
120 part_sum.s0 += part_sum.s1;
121 part_sum_sq.s0 += part_sum_sq.s1;
122
123 float sum = (float)part_sum.s0;
124 float sum_sq = (float)part_sum_sq.s0;
125
126 const float mean = (sum / elements_plane);
127 const float var = (sum_sq / elements_plane) - (mean * mean);
128
129 __global DATA_TYPE *output_address0 = (__global DATA_TYPE *)tensor3D_offset(&out, ch, 0, batch);
130 *output_address0 = mean;
131 __global DATA_TYPE *output_address1 = (__global DATA_TYPE *)tensor3D_offset(&out, ch, 1, batch);
132 *output_address1 = var;
133
134#endif // defined(NHWC)
135}
136#endif /* defined(VEC_SIZE) && defined(DATA_TYPE) && defined(DIM_X) && defined(DIM_Y) && defined(DIM_Z) */
137
Georgios Pinitas55a687d2020-01-30 12:00:23 +0000138#if defined(VEC_SIZE) && defined(DATA_TYPE) && defined(INTERNAL_DATA_TYPE) && defined(GAMMA) && defined(BETA) && defined(EPSILON) && defined(DIM_X) && defined(DIM_Y) && defined(DIM_Z)
Manuel Bottini79f88e62019-09-18 15:02:53 +0100139/** This function normalizes the input 2D tensor across the first dimension with respect to mean and standard deviation of the same dimension.
140 *
141 * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
142 * @attention Data type should be passed using the -DDATA_TYPE=data_type compile flag, e.g. -DDATA_TYPE=float
143 * @attention The scale scalar value applied to the normalized tensor should be passed using the -DGAMMA=value compile flag, e.g. -DGAMMA=1.3
144 * @attention The offset scalar value applied to the normalized tensor should be passed using the -DBETA=value compile flag, e.g. -DBETA=2.4
145 * @attention Normalization epsilon parameter should be given as a preprocessor argument with -DEPSILON=value. e.g. -DEPSILON=0.001f
146 * @attention Dimensions X, Y, and Z should be given as a preprocessor argument with -DDIM_X=value, -DDIM_Y=value, -DDIM_Z=value. e.g. -DDIM_X=6, -DDIM_Y=2, -DDIM_Z=7
147 *
148 * @param[in] input_ptr Pointer to the first source tensor. Supported data types: F16/F32
149 * @param[in] input_stride_x Stride of the first source tensor in X dimension (in bytes)
150 * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
151 * @param[in] input_stride_y Stride of the first source tensor in Y dimension (in bytes)
152 * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
153 * @param[in] input_stride_z Stride of the first source tensor in Z dimension (in bytes)
154 * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
155 * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source tensor
156 * @param[out] output_ptr (Optional) Pointer to the destination tensor. Supported data types: same as @p input_ptr
157 * @param[in] output_stride_x (Optional) Stride of the destination tensor in X dimension (in bytes)
158 * @param[in] output_step_x (Optional) output_stride_x * number of elements along X processed per workitem(in bytes)
159 * @param[in] output_stride_y (Optional) Stride of the destination tensor in Y dimension (in bytes)
160 * @param[in] output_step_y (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes)
161 * @param[in] output_stride_z (Optional) Stride of the destination tensor in Z dimension (in bytes)
162 * @param[in] output_step_z (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes)
163 * @param[in] output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination tensor
164 */
165__kernel void instance_normalization(
Pablo Marquez Tellofe7ae812021-03-03 12:12:35 +0000166 TENSOR4D_DECLARATION(input),
167 TENSOR3D_DECLARATION(mean_var)
Manuel Bottini79f88e62019-09-18 15:02:53 +0100168#ifndef IN_PLACE
169 ,
170 TENSOR4D_DECLARATION(output)
171#endif /* IN_PLACE */
172)
173{
Pablo Marquez Tellofe7ae812021-03-03 12:12:35 +0000174 Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, 0);
175 Tensor3D mean_var = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(mean_var);
Michele Di Giorgioa0a3d202019-11-20 15:06:52 +0000176#ifndef IN_PLACE
177 Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(output, 0);
178#endif /* IN_PLACE */
179
Pablo Marquez Tellofe7ae812021-03-03 12:12:35 +0000180#if defined(NHWC)
181 const int ch = get_global_id(0); // Current channel
182 const int batch = get_global_id(2); // Current batch
183#else /* defined(NHWC) */
184 const int ch = get_global_id(2) % DIM_Z; // Current channel
185 const int batch = get_global_id(2) / DIM_Z; // Current batch
186#endif /* defined(NHWC) */
187
188 const __global DATA_TYPE *mean_ptr = (__global DATA_TYPE *)tensor3D_offset(&mean_var, ch, 0, batch);
189 const __global DATA_TYPE *var_ptr = (__global DATA_TYPE *)tensor3D_offset(&mean_var, ch, 1, batch);
190 const INTERNAL_DATA_TYPE mean = (INTERNAL_DATA_TYPE) * mean_ptr;
191 const INTERNAL_DATA_TYPE var = (INTERNAL_DATA_TYPE) * var_ptr;
192 const INTERNAL_DATA_TYPE multip = GAMMA / sqrt(var + EPSILON);
193 const INTERNAL_DATA_TYPE beta = (INTERNAL_DATA_TYPE)BETA;
Manuel Bottini79f88e62019-09-18 15:02:53 +0100194
195#if defined(NHWC)
Pablo Marquez Tellofe7ae812021-03-03 12:12:35 +0000196 const int in_offset = input_offset_first_element_in_bytes + batch * input_stride_w + ch * sizeof(DATA_TYPE);
197#ifndef IN_PLACE
198 const int out_offset = output_offset_first_element_in_bytes + batch * input_stride_w + ch * sizeof(DATA_TYPE);
Manuel Bottini79f88e62019-09-18 15:02:53 +0100199#endif /* IN_PLACE */
Pablo Marquez Tellofe7ae812021-03-03 12:12:35 +0000200
201 for(int i = 0; i < (DIM_Y * DIM_Z); ++i)
202 {
203 __global DATA_TYPE *input_address = (__global DATA_TYPE *)(input_ptr + in_offset + i * input_stride_y);
204#ifdef IN_PLACE
205 __global DATA_TYPE *output_address = input_address;
206#else /* !IN_PLACE */
207 __global DATA_TYPE *output_address = (__global DATA_TYPE *)(output_ptr + out_offset + i * output_stride_y);
208#endif /* IN_PLACE */
209 *(output_address) = (*(input_address) - mean) * multip + beta;
Manuel Bottini79f88e62019-09-18 15:02:53 +0100210 }
211
212#else // !defined(NHWC)
Michele Di Giorgioa0a3d202019-11-20 15:06:52 +0000213 for(int y = 0; y < DIM_Y; ++y)
Manuel Bottini79f88e62019-09-18 15:02:53 +0100214 {
Michele Di Giorgioa0a3d202019-11-20 15:06:52 +0000215 int x = 0;
216 for(; x <= (DIM_X - VEC_SIZE); x += VEC_SIZE)
217 {
218 __global DATA_TYPE *input_address = (__global DATA_TYPE *)tensor4D_offset(&in, x, y, ch, batch);
Manuel Bottini79f88e62019-09-18 15:02:53 +0100219#ifdef IN_PLACE
Michele Di Giorgioa0a3d202019-11-20 15:06:52 +0000220 __global DATA_TYPE *output_address = input_address;
Manuel Bottini79f88e62019-09-18 15:02:53 +0100221#else /* !IN_PLACE */
Michele Di Giorgioa0a3d202019-11-20 15:06:52 +0000222 __global DATA_TYPE *output_address = (__global DATA_TYPE *)tensor4D_offset(&out, x, y, ch, batch);
Manuel Bottini79f88e62019-09-18 15:02:53 +0100223#endif /* IN_PLACE */
224
Georgios Pinitas55a687d2020-01-30 12:00:23 +0000225 VEC_DATA_TYPE(INTERNAL_DATA_TYPE, VEC_SIZE)
226 data = CONVERT(VLOAD(VEC_SIZE)(0, input_address), VEC_DATA_TYPE(INTERNAL_DATA_TYPE, VEC_SIZE));
Manuel Bottini79f88e62019-09-18 15:02:53 +0100227
Georgios Pinitas55a687d2020-01-30 12:00:23 +0000228 VEC_DATA_TYPE(INTERNAL_DATA_TYPE, VEC_SIZE)
229 res = (data - mean) * multip + (INTERNAL_DATA_TYPE)BETA;
Michele Di Giorgioa0a3d202019-11-20 15:06:52 +0000230 VSTORE(VEC_SIZE)
Georgios Pinitas55a687d2020-01-30 12:00:23 +0000231 (CONVERT(res, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)), 0, output_address);
Michele Di Giorgioa0a3d202019-11-20 15:06:52 +0000232 }
233 // Left-overs loop
234 for(; x < DIM_X; ++x)
235 {
236 __global DATA_TYPE *input_address = (__global DATA_TYPE *)tensor4D_offset(&in, x, y, ch, batch);
Manuel Bottini79f88e62019-09-18 15:02:53 +0100237#ifdef IN_PLACE
Michele Di Giorgioa0a3d202019-11-20 15:06:52 +0000238 __global DATA_TYPE *output_address = input_address;
Manuel Bottini79f88e62019-09-18 15:02:53 +0100239#else /* !IN_PLACE */
Michele Di Giorgioa0a3d202019-11-20 15:06:52 +0000240 __global DATA_TYPE *output_address = (__global DATA_TYPE *)tensor4D_offset(&out, x, y, ch, batch);
Manuel Bottini79f88e62019-09-18 15:02:53 +0100241#endif /* IN_PLACE */
Georgios Pinitas55a687d2020-01-30 12:00:23 +0000242 *(output_address) = (*(input_address) - mean) * multip + (INTERNAL_DATA_TYPE)BETA;
Michele Di Giorgioa0a3d202019-11-20 15:06:52 +0000243 }
Manuel Bottini79f88e62019-09-18 15:02:53 +0100244 }
245#endif // defined(NHWC)
246}
Georgios Pinitas55a687d2020-01-30 12:00:23 +0000247#endif /* defined(VEC_SIZE) && defined(DATA_TYPE) && defined(INTERNAL_DATA_TYPE) && defined(GAMMA) && defined(BETA) && defined(EPSILON) && defined(DIM_X) && defined(DIM_Y) && defined(DIM_Z) */