Manuel Bottini | 79f88e6 | 2019-09-18 15:02:53 +0100 | [diff] [blame] | 1 | /* |
Viet-Hoa Do | 6829e02 | 2024-01-16 16:23:24 +0000 | [diff] [blame] | 2 | * Copyright (c) 2019-2021, 2024 Arm Limited. |
Manuel Bottini | 79f88e6 | 2019-09-18 15:02:53 +0100 | [diff] [blame] | 3 | * |
| 4 | * SPDX-License-Identifier: MIT |
| 5 | * |
| 6 | * Permission is hereby granted, free of charge, to any person obtaining a copy |
| 7 | * of this software and associated documentation files (the "Software"), to |
| 8 | * deal in the Software without restriction, including without limitation the |
| 9 | * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or |
| 10 | * sell copies of the Software, and to permit persons to whom the Software is |
| 11 | * furnished to do so, subject to the following conditions: |
| 12 | * |
| 13 | * The above copyright notice and this permission notice shall be included in all |
| 14 | * copies or substantial portions of the Software. |
| 15 | * |
| 16 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
| 17 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
| 18 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE |
| 19 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
| 20 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, |
| 21 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE |
| 22 | * SOFTWARE. |
| 23 | */ |
| 24 | #include "helpers.h" |
| 25 | |
Pablo Tello | 5c3eeec | 2021-04-26 15:39:05 +0100 | [diff] [blame] | 26 | #if defined(VEC_SIZE) && defined(DATA_TYPE) && defined(INTERNAL_DATA_TYPE) & defined(DIM_X) && defined(DIM_Y) && defined(DIM_Z) |
Pablo Marquez Tello | fe7ae81 | 2021-03-03 12:12:35 +0000 | [diff] [blame] | 27 | /** This function computes the mean and variance of each plane of the input tensor and provides it as output. |
| 28 | * |
| 29 | * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16 |
| 30 | * @attention Data type should be passed using the -DDATA_TYPE=data_type compile flag, e.g. -DDATA_TYPE=float |
| 31 | * @attention Dimensions X, Y, and Z should be given as a preprocessor argument with -DDIM_X=value, -DDIM_Y=value, -DDIM_Z=value. e.g. -DDIM_X=6, -DDIM_Y=2, -DDIM_Z=7 |
| 32 | * |
| 33 | * @param[in] input_ptr Pointer to the first source tensor. Supported data types: F16/F32 |
| 34 | * @param[in] input_stride_x Stride of the first source tensor in X dimension (in bytes) |
| 35 | * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) |
| 36 | * @param[in] input_stride_y Stride of the first source tensor in Y dimension (in bytes) |
| 37 | * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) |
| 38 | * @param[in] input_stride_z Stride of the first source tensor in Z dimension (in bytes) |
| 39 | * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) |
| 40 | * @param[in] input_stride_w Stride of the source tensor in W dimension (in bytes) |
| 41 | * @param[in] input_step_w input_stride_w * number of elements along W processed per workitem(in bytes) |
| 42 | * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source tensor |
| 43 | * @param[out] output_ptr (Optional) Pointer to the destination tensor. Supported data types: same as @p input_ptr |
| 44 | * @param[in] output_stride_x (Optional) Stride of the destination tensor in X dimension (in bytes) |
| 45 | * @param[in] output_step_x (Optional) output_stride_x * number of elements along X processed per workitem(in bytes) |
| 46 | * @param[in] output_stride_y (Optional) Stride of the destination tensor in Y dimension (in bytes) |
| 47 | * @param[in] output_step_y (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes) |
| 48 | * @param[in] output_stride_z (Optional) Stride of the destination tensor in Z dimension (in bytes) |
| 49 | * @param[in] output_step_z (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes) |
| 50 | * @param[in] output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination tensor |
| 51 | */ |
| 52 | __kernel void compute_mean_var( |
| 53 | TENSOR4D_DECLARATION(input), |
| 54 | TENSOR3D_DECLARATION(output)) |
| 55 | { |
Viet-Hoa Do | 6829e02 | 2024-01-16 16:23:24 +0000 | [diff] [blame] | 56 | Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input); |
Pablo Marquez Tello | fe7ae81 | 2021-03-03 12:12:35 +0000 | [diff] [blame] | 57 | Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(output); |
| 58 | |
| 59 | #if defined(NHWC) |
Pablo Tello | 5c3eeec | 2021-04-26 15:39:05 +0100 | [diff] [blame] | 60 | const int ch = get_global_id(0); // Current channel |
| 61 | const int batch = get_global_id(1); // Current batch |
| 62 | const int elements_plane = DIM_Y * DIM_Z; |
| 63 | INTERNAL_DATA_TYPE part_sum = 0.f; |
| 64 | INTERNAL_DATA_TYPE part_sum_sq = 0.f; |
| 65 | const int in_offset = input_offset_first_element_in_bytes + batch * input_stride_w + ch * sizeof(DATA_TYPE); |
| 66 | |
| 67 | for(int i_w = 0; i_w < DIM_Y; ++i_w) |
Pablo Marquez Tello | fe7ae81 | 2021-03-03 12:12:35 +0000 | [diff] [blame] | 68 | { |
Pablo Tello | 5c3eeec | 2021-04-26 15:39:05 +0100 | [diff] [blame] | 69 | for(int i_h = 0; i_h < DIM_Z; ++i_h) |
| 70 | { |
| 71 | INTERNAL_DATA_TYPE data = (INTERNAL_DATA_TYPE) * ((__global DATA_TYPE *)tensor4D_offset(&in, ch, i_w, i_h, batch)); |
| 72 | part_sum += data; |
| 73 | part_sum_sq += data * data; |
| 74 | } |
Pablo Marquez Tello | fe7ae81 | 2021-03-03 12:12:35 +0000 | [diff] [blame] | 75 | } |
Pablo Tello | 5c3eeec | 2021-04-26 15:39:05 +0100 | [diff] [blame] | 76 | |
| 77 | INTERNAL_DATA_TYPE mean = (part_sum / elements_plane); |
| 78 | INTERNAL_DATA_TYPE var = (part_sum_sq / elements_plane) - (mean * mean); |
| 79 | __global INTERNAL_DATA_TYPE *output_address0 = (__global INTERNAL_DATA_TYPE *)tensor3D_offset(&out, ch, 0, batch); |
| 80 | *output_address0 = mean; |
| 81 | __global INTERNAL_DATA_TYPE *output_address1 = (__global INTERNAL_DATA_TYPE *)tensor3D_offset(&out, ch, 1, batch); |
| 82 | *output_address1 = var; |
Pablo Marquez Tello | fe7ae81 | 2021-03-03 12:12:35 +0000 | [diff] [blame] | 83 | #else // !defined(NHWC) |
| 84 | const int ch = get_global_id(2) % DIM_Z; // Current channel |
| 85 | const int batch = get_global_id(2) / DIM_Z; // Current batch |
| 86 | const int elements_plane = DIM_X * DIM_Y; |
| 87 | |
Pablo Tello | 5c3eeec | 2021-04-26 15:39:05 +0100 | [diff] [blame] | 88 | VEC_DATA_TYPE(INTERNAL_DATA_TYPE, VEC_SIZE) |
Pablo Marquez Tello | fe7ae81 | 2021-03-03 12:12:35 +0000 | [diff] [blame] | 89 | part_sum = 0.f; |
Pablo Tello | 5c3eeec | 2021-04-26 15:39:05 +0100 | [diff] [blame] | 90 | VEC_DATA_TYPE(INTERNAL_DATA_TYPE, VEC_SIZE) |
Pablo Marquez Tello | fe7ae81 | 2021-03-03 12:12:35 +0000 | [diff] [blame] | 91 | part_sum_sq = 0.f; |
| 92 | // Calculate partial sum |
| 93 | for(int y = 0; y < DIM_Y; ++y) |
| 94 | { |
| 95 | int x = 0; |
| 96 | for(; x <= (DIM_X - VEC_SIZE); x += VEC_SIZE) |
| 97 | { |
| 98 | // Load data |
Pablo Tello | 5c3eeec | 2021-04-26 15:39:05 +0100 | [diff] [blame] | 99 | VEC_DATA_TYPE(INTERNAL_DATA_TYPE, VEC_SIZE) |
| 100 | data = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)tensor4D_offset(&in, x, y, ch, batch)), VEC_DATA_TYPE(INTERNAL_DATA_TYPE, VEC_SIZE)); |
Pablo Marquez Tello | fe7ae81 | 2021-03-03 12:12:35 +0000 | [diff] [blame] | 101 | part_sum += data; |
| 102 | part_sum_sq += data * data; |
| 103 | } |
| 104 | // Left-overs loop |
| 105 | for(; x < DIM_X; ++x) |
| 106 | { |
Pablo Tello | 5c3eeec | 2021-04-26 15:39:05 +0100 | [diff] [blame] | 107 | INTERNAL_DATA_TYPE data = (INTERNAL_DATA_TYPE)(*((__global DATA_TYPE *)tensor4D_offset(&in, x, y, ch, batch))); |
Pablo Marquez Tello | fe7ae81 | 2021-03-03 12:12:35 +0000 | [diff] [blame] | 108 | part_sum.s0 += data; |
| 109 | part_sum_sq.s0 += data * data; |
| 110 | } |
| 111 | } |
| 112 | // Perform reduction |
| 113 | #if VEC_SIZE > 8 |
| 114 | part_sum.s01234567 += part_sum.s89abcdef; |
| 115 | part_sum_sq.s01234567 += part_sum_sq.s89abcdef; |
| 116 | #endif // VEC_SIZE > 8 |
| 117 | #if VEC_SIZE > 4 |
| 118 | part_sum.s0123 += part_sum.s4567; |
| 119 | part_sum_sq.s0123 += part_sum_sq.s4567; |
| 120 | #endif // VEC_SIZE > 4 |
| 121 | #if VEC_SIZE > 2 |
| 122 | part_sum.s01 += part_sum.s23; |
| 123 | part_sum_sq.s01 += part_sum_sq.s23; |
| 124 | #endif // VEC_SIZE > 2 |
| 125 | part_sum.s0 += part_sum.s1; |
| 126 | part_sum_sq.s0 += part_sum_sq.s1; |
| 127 | |
Pablo Tello | 5c3eeec | 2021-04-26 15:39:05 +0100 | [diff] [blame] | 128 | INTERNAL_DATA_TYPE sum = (INTERNAL_DATA_TYPE)part_sum.s0; |
| 129 | INTERNAL_DATA_TYPE sum_sq = (INTERNAL_DATA_TYPE)part_sum_sq.s0; |
Pablo Marquez Tello | fe7ae81 | 2021-03-03 12:12:35 +0000 | [diff] [blame] | 130 | |
Pablo Tello | 5c3eeec | 2021-04-26 15:39:05 +0100 | [diff] [blame] | 131 | const INTERNAL_DATA_TYPE mean = (sum / elements_plane); |
| 132 | const INTERNAL_DATA_TYPE var = (sum_sq / elements_plane) - (mean * mean); |
Pablo Marquez Tello | fe7ae81 | 2021-03-03 12:12:35 +0000 | [diff] [blame] | 133 | |
Pablo Tello | 5c3eeec | 2021-04-26 15:39:05 +0100 | [diff] [blame] | 134 | __global INTERNAL_DATA_TYPE *output_address0 = (__global INTERNAL_DATA_TYPE *)tensor3D_offset(&out, ch, 0, batch); |
| 135 | *output_address0 = mean; |
| 136 | __global INTERNAL_DATA_TYPE *output_address1 = (__global INTERNAL_DATA_TYPE *)tensor3D_offset(&out, ch, 1, batch); |
| 137 | *output_address1 = var; |
Pablo Marquez Tello | fe7ae81 | 2021-03-03 12:12:35 +0000 | [diff] [blame] | 138 | |
| 139 | #endif // defined(NHWC) |
| 140 | } |
| 141 | #endif /* defined(VEC_SIZE) && defined(DATA_TYPE) && defined(DIM_X) && defined(DIM_Y) && defined(DIM_Z) */ |
| 142 | |
Georgios Pinitas | 55a687d | 2020-01-30 12:00:23 +0000 | [diff] [blame] | 143 | #if defined(VEC_SIZE) && defined(DATA_TYPE) && defined(INTERNAL_DATA_TYPE) && defined(GAMMA) && defined(BETA) && defined(EPSILON) && defined(DIM_X) && defined(DIM_Y) && defined(DIM_Z) |
Manuel Bottini | 79f88e6 | 2019-09-18 15:02:53 +0100 | [diff] [blame] | 144 | /** This function normalizes the input 2D tensor across the first dimension with respect to mean and standard deviation of the same dimension. |
| 145 | * |
| 146 | * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16 |
| 147 | * @attention Data type should be passed using the -DDATA_TYPE=data_type compile flag, e.g. -DDATA_TYPE=float |
| 148 | * @attention The scale scalar value applied to the normalized tensor should be passed using the -DGAMMA=value compile flag, e.g. -DGAMMA=1.3 |
| 149 | * @attention The offset scalar value applied to the normalized tensor should be passed using the -DBETA=value compile flag, e.g. -DBETA=2.4 |
| 150 | * @attention Normalization epsilon parameter should be given as a preprocessor argument with -DEPSILON=value. e.g. -DEPSILON=0.001f |
| 151 | * @attention Dimensions X, Y, and Z should be given as a preprocessor argument with -DDIM_X=value, -DDIM_Y=value, -DDIM_Z=value. e.g. -DDIM_X=6, -DDIM_Y=2, -DDIM_Z=7 |
| 152 | * |
| 153 | * @param[in] input_ptr Pointer to the first source tensor. Supported data types: F16/F32 |
| 154 | * @param[in] input_stride_x Stride of the first source tensor in X dimension (in bytes) |
| 155 | * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) |
| 156 | * @param[in] input_stride_y Stride of the first source tensor in Y dimension (in bytes) |
| 157 | * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) |
| 158 | * @param[in] input_stride_z Stride of the first source tensor in Z dimension (in bytes) |
| 159 | * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) |
| 160 | * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source tensor |
| 161 | * @param[out] output_ptr (Optional) Pointer to the destination tensor. Supported data types: same as @p input_ptr |
| 162 | * @param[in] output_stride_x (Optional) Stride of the destination tensor in X dimension (in bytes) |
| 163 | * @param[in] output_step_x (Optional) output_stride_x * number of elements along X processed per workitem(in bytes) |
| 164 | * @param[in] output_stride_y (Optional) Stride of the destination tensor in Y dimension (in bytes) |
| 165 | * @param[in] output_step_y (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes) |
| 166 | * @param[in] output_stride_z (Optional) Stride of the destination tensor in Z dimension (in bytes) |
| 167 | * @param[in] output_step_z (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes) |
| 168 | * @param[in] output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination tensor |
| 169 | */ |
| 170 | __kernel void instance_normalization( |
Pablo Marquez Tello | fe7ae81 | 2021-03-03 12:12:35 +0000 | [diff] [blame] | 171 | TENSOR4D_DECLARATION(input), |
| 172 | TENSOR3D_DECLARATION(mean_var) |
Manuel Bottini | 79f88e6 | 2019-09-18 15:02:53 +0100 | [diff] [blame] | 173 | #ifndef IN_PLACE |
| 174 | , |
| 175 | TENSOR4D_DECLARATION(output) |
| 176 | #endif /* IN_PLACE */ |
| 177 | ) |
| 178 | { |
Viet-Hoa Do | 6829e02 | 2024-01-16 16:23:24 +0000 | [diff] [blame] | 179 | Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input); |
Pablo Marquez Tello | fe7ae81 | 2021-03-03 12:12:35 +0000 | [diff] [blame] | 180 | Tensor3D mean_var = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(mean_var); |
Michele Di Giorgio | a0a3d20 | 2019-11-20 15:06:52 +0000 | [diff] [blame] | 181 | #ifndef IN_PLACE |
Viet-Hoa Do | 6829e02 | 2024-01-16 16:23:24 +0000 | [diff] [blame] | 182 | Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(output); |
Michele Di Giorgio | a0a3d20 | 2019-11-20 15:06:52 +0000 | [diff] [blame] | 183 | #endif /* IN_PLACE */ |
| 184 | |
Pablo Marquez Tello | fe7ae81 | 2021-03-03 12:12:35 +0000 | [diff] [blame] | 185 | #if defined(NHWC) |
| 186 | const int ch = get_global_id(0); // Current channel |
| 187 | const int batch = get_global_id(2); // Current batch |
| 188 | #else /* defined(NHWC) */ |
| 189 | const int ch = get_global_id(2) % DIM_Z; // Current channel |
| 190 | const int batch = get_global_id(2) / DIM_Z; // Current batch |
| 191 | #endif /* defined(NHWC) */ |
| 192 | |
Pablo Tello | 5c3eeec | 2021-04-26 15:39:05 +0100 | [diff] [blame] | 193 | const __global INTERNAL_DATA_TYPE *mean_ptr = (__global INTERNAL_DATA_TYPE *)tensor3D_offset(&mean_var, ch, 0, batch); |
| 194 | const __global INTERNAL_DATA_TYPE *var_ptr = (__global INTERNAL_DATA_TYPE *)tensor3D_offset(&mean_var, ch, 1, batch); |
| 195 | const INTERNAL_DATA_TYPE mean = (INTERNAL_DATA_TYPE) * mean_ptr; |
| 196 | const INTERNAL_DATA_TYPE var = (INTERNAL_DATA_TYPE) * var_ptr; |
| 197 | const INTERNAL_DATA_TYPE multip = GAMMA / sqrt(var + EPSILON); |
| 198 | const INTERNAL_DATA_TYPE beta = (INTERNAL_DATA_TYPE)BETA; |
Manuel Bottini | 79f88e6 | 2019-09-18 15:02:53 +0100 | [diff] [blame] | 199 | |
| 200 | #if defined(NHWC) |
Pablo Marquez Tello | fe7ae81 | 2021-03-03 12:12:35 +0000 | [diff] [blame] | 201 | const int in_offset = input_offset_first_element_in_bytes + batch * input_stride_w + ch * sizeof(DATA_TYPE); |
| 202 | #ifndef IN_PLACE |
| 203 | const int out_offset = output_offset_first_element_in_bytes + batch * input_stride_w + ch * sizeof(DATA_TYPE); |
Manuel Bottini | 79f88e6 | 2019-09-18 15:02:53 +0100 | [diff] [blame] | 204 | #endif /* IN_PLACE */ |
Pablo Marquez Tello | fe7ae81 | 2021-03-03 12:12:35 +0000 | [diff] [blame] | 205 | |
Pablo Tello | 5c3eeec | 2021-04-26 15:39:05 +0100 | [diff] [blame] | 206 | for(int i_w = 0; i_w < DIM_Y; ++i_w) |
Pablo Marquez Tello | fe7ae81 | 2021-03-03 12:12:35 +0000 | [diff] [blame] | 207 | { |
Pablo Tello | 5c3eeec | 2021-04-26 15:39:05 +0100 | [diff] [blame] | 208 | for(int i_h = 0; i_h < DIM_Z; ++i_h) |
| 209 | { |
| 210 | __global DATA_TYPE *input_address = (__global DATA_TYPE *)tensor4D_offset(&in, ch, i_w, i_h, batch); |
Pablo Marquez Tello | fe7ae81 | 2021-03-03 12:12:35 +0000 | [diff] [blame] | 211 | #ifdef IN_PLACE |
Pablo Tello | 5c3eeec | 2021-04-26 15:39:05 +0100 | [diff] [blame] | 212 | __global DATA_TYPE *output_address = input_address; |
Pablo Marquez Tello | fe7ae81 | 2021-03-03 12:12:35 +0000 | [diff] [blame] | 213 | #else /* !IN_PLACE */ |
Pablo Tello | 5c3eeec | 2021-04-26 15:39:05 +0100 | [diff] [blame] | 214 | __global DATA_TYPE *output_address = (__global DATA_TYPE *)tensor4D_offset(&out, ch, i_w, i_h, batch); |
Pablo Marquez Tello | fe7ae81 | 2021-03-03 12:12:35 +0000 | [diff] [blame] | 215 | #endif /* IN_PLACE */ |
Pablo Tello | 5c3eeec | 2021-04-26 15:39:05 +0100 | [diff] [blame] | 216 | *(output_address) = (*(input_address) - mean) * multip + (INTERNAL_DATA_TYPE)BETA; |
| 217 | } |
Manuel Bottini | 79f88e6 | 2019-09-18 15:02:53 +0100 | [diff] [blame] | 218 | } |
Manuel Bottini | 79f88e6 | 2019-09-18 15:02:53 +0100 | [diff] [blame] | 219 | #else // !defined(NHWC) |
Michele Di Giorgio | a0a3d20 | 2019-11-20 15:06:52 +0000 | [diff] [blame] | 220 | for(int y = 0; y < DIM_Y; ++y) |
Manuel Bottini | 79f88e6 | 2019-09-18 15:02:53 +0100 | [diff] [blame] | 221 | { |
Michele Di Giorgio | a0a3d20 | 2019-11-20 15:06:52 +0000 | [diff] [blame] | 222 | int x = 0; |
| 223 | for(; x <= (DIM_X - VEC_SIZE); x += VEC_SIZE) |
| 224 | { |
| 225 | __global DATA_TYPE *input_address = (__global DATA_TYPE *)tensor4D_offset(&in, x, y, ch, batch); |
Manuel Bottini | 79f88e6 | 2019-09-18 15:02:53 +0100 | [diff] [blame] | 226 | #ifdef IN_PLACE |
Michele Di Giorgio | a0a3d20 | 2019-11-20 15:06:52 +0000 | [diff] [blame] | 227 | __global DATA_TYPE *output_address = input_address; |
Manuel Bottini | 79f88e6 | 2019-09-18 15:02:53 +0100 | [diff] [blame] | 228 | #else /* !IN_PLACE */ |
Michele Di Giorgio | a0a3d20 | 2019-11-20 15:06:52 +0000 | [diff] [blame] | 229 | __global DATA_TYPE *output_address = (__global DATA_TYPE *)tensor4D_offset(&out, x, y, ch, batch); |
Manuel Bottini | 79f88e6 | 2019-09-18 15:02:53 +0100 | [diff] [blame] | 230 | #endif /* IN_PLACE */ |
| 231 | |
Georgios Pinitas | 55a687d | 2020-01-30 12:00:23 +0000 | [diff] [blame] | 232 | VEC_DATA_TYPE(INTERNAL_DATA_TYPE, VEC_SIZE) |
| 233 | data = CONVERT(VLOAD(VEC_SIZE)(0, input_address), VEC_DATA_TYPE(INTERNAL_DATA_TYPE, VEC_SIZE)); |
Manuel Bottini | 79f88e6 | 2019-09-18 15:02:53 +0100 | [diff] [blame] | 234 | |
Georgios Pinitas | 55a687d | 2020-01-30 12:00:23 +0000 | [diff] [blame] | 235 | VEC_DATA_TYPE(INTERNAL_DATA_TYPE, VEC_SIZE) |
| 236 | res = (data - mean) * multip + (INTERNAL_DATA_TYPE)BETA; |
Michele Di Giorgio | a0a3d20 | 2019-11-20 15:06:52 +0000 | [diff] [blame] | 237 | VSTORE(VEC_SIZE) |
Georgios Pinitas | 55a687d | 2020-01-30 12:00:23 +0000 | [diff] [blame] | 238 | (CONVERT(res, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)), 0, output_address); |
Michele Di Giorgio | a0a3d20 | 2019-11-20 15:06:52 +0000 | [diff] [blame] | 239 | } |
| 240 | // Left-overs loop |
| 241 | for(; x < DIM_X; ++x) |
| 242 | { |
| 243 | __global DATA_TYPE *input_address = (__global DATA_TYPE *)tensor4D_offset(&in, x, y, ch, batch); |
Manuel Bottini | 79f88e6 | 2019-09-18 15:02:53 +0100 | [diff] [blame] | 244 | #ifdef IN_PLACE |
Michele Di Giorgio | a0a3d20 | 2019-11-20 15:06:52 +0000 | [diff] [blame] | 245 | __global DATA_TYPE *output_address = input_address; |
Manuel Bottini | 79f88e6 | 2019-09-18 15:02:53 +0100 | [diff] [blame] | 246 | #else /* !IN_PLACE */ |
Michele Di Giorgio | a0a3d20 | 2019-11-20 15:06:52 +0000 | [diff] [blame] | 247 | __global DATA_TYPE *output_address = (__global DATA_TYPE *)tensor4D_offset(&out, x, y, ch, batch); |
Manuel Bottini | 79f88e6 | 2019-09-18 15:02:53 +0100 | [diff] [blame] | 248 | #endif /* IN_PLACE */ |
Georgios Pinitas | 55a687d | 2020-01-30 12:00:23 +0000 | [diff] [blame] | 249 | *(output_address) = (*(input_address) - mean) * multip + (INTERNAL_DATA_TYPE)BETA; |
Michele Di Giorgio | a0a3d20 | 2019-11-20 15:06:52 +0000 | [diff] [blame] | 250 | } |
Manuel Bottini | 79f88e6 | 2019-09-18 15:02:53 +0100 | [diff] [blame] | 251 | } |
| 252 | #endif // defined(NHWC) |
| 253 | } |
Georgios Pinitas | 55a687d | 2020-01-30 12:00:23 +0000 | [diff] [blame] | 254 | #endif /* defined(VEC_SIZE) && defined(DATA_TYPE) && defined(INTERNAL_DATA_TYPE) && defined(GAMMA) && defined(BETA) && defined(EPSILON) && defined(DIM_X) && defined(DIM_Y) && defined(DIM_Z) */ |