blob: 96e2b15a9c29225ecafd5e09bd09bbb55cf9188b [file] [log] [blame]
Chunosovf450caa2017-11-08 16:09:35 +07001/*
Michele Di Giorgiod9eaf612020-07-08 11:12:57 +01002 * Copyright (c) 2017-2020 Arm Limited.
Chunosovf450caa2017-11-08 16:09:35 +07003 *
4 * SPDX-License-Identifier: MIT
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to
8 * deal in the Software without restriction, including without limitation the
9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10 * sell copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in all
14 * copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
Giorgio Arena4402cb92018-02-15 13:37:40 +000024#include "helpers_asymm.h"
Chunosovf450caa2017-11-08 16:09:35 +070025
// Element-wise reduction primitives. The (type, size) arguments are unused in
// this quantized variant but are kept so call sites stay source-compatible
// with the float softmax kernels, which share the same macro signatures.
#define MAX_OP(x, y, type, size) max((x), (y))
#define ADD_OP(x, y, type, size) ((x) + (y))
#define SUB_OP(x, y, type, size) ((x) - (y))
Chunosovf450caa2017-11-08 16:09:35 +070029
/* Number of workitems in dimension 0. Defaults to 1 (serial kernel) when the
 * host does not pass -DGRID_SIZE for the parallel kernel. */
#if !defined(GRID_SIZE)
#define GRID_SIZE 1
#endif /* !defined(GRID_SIZE) */
Chunosovf450caa2017-11-08 16:09:35 +070034
/* Per-VECTOR_SIZE specialisations: a constant lane-index vector (idx__) used
 * for boundary masking in the NON_MULTIPLE_OF_VECTOR_SIZE paths, plus wrappers
 * around the fixed-point helpers from helpers_asymm.h instantiated for the
 * chosen vector width.
 *
 * NOTE(review): only the default (16) branch defines LOG_VECTOR_SIZE; for
 * VECTOR_SIZE 2/4/8 it is presumably supplied as a -D build option by the
 * host — confirm against the host-side kernel build options. */
#if VECTOR_SIZE == 2
__constant uint2 idx__ = (uint2)(0, 1);
#define asymm_mult(a, b) ASYMM_MULT(a, b, 2)
#define asymm_exp_on_negative_values(a, k_integer_bits) ASYMM_EXP_ON_NEGATIVE_VALUES(a, k_integer_bits, 2)
#define asymm_rescale(value, src_integer_bits, dst_integer_bits) ASYMM_RESCALE(value, src_integer_bits, dst_integer_bits, 2)

#elif VECTOR_SIZE == 4
__constant uint4 idx__ = (uint4)(0, 1, 2, 3);
#define asymm_mult(a, b) ASYMM_MULT(a, b, 4)
#define asymm_exp_on_negative_values(a, k_integer_bits) ASYMM_EXP_ON_NEGATIVE_VALUES(a, k_integer_bits, 4)
#define asymm_rescale(value, src_integer_bits, dst_integer_bits) ASYMM_RESCALE(value, src_integer_bits, dst_integer_bits, 4)

#elif VECTOR_SIZE == 8
__constant uint8 idx__ = (uint8)(0, 1, 2, 3, 4, 5, 6, 7);
#define asymm_mult(a, b) ASYMM_MULT(a, b, 8)
#define asymm_exp_on_negative_values(a, k_integer_bits) ASYMM_EXP_ON_NEGATIVE_VALUES(a, k_integer_bits, 8)
#define asymm_rescale(value, src_integer_bits, dst_integer_bits) ASYMM_RESCALE(value, src_integer_bits, dst_integer_bits, 8)

#else /* VECTOR_SIZE DEFAULT */
#define VECTOR_SIZE 16
#define LOG_VECTOR_SIZE 4
__constant uint16 idx__ = (uint16)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
#define asymm_mult(a, b) ASYMM_MULT(a, b, 16)
#define asymm_exp_on_negative_values(a, k_integer_bits) ASYMM_EXP_ON_NEGATIVE_VALUES(a, k_integer_bits, 16)
#define asymm_rescale(value, src_integer_bits, dst_integer_bits) ASYMM_RESCALE(value, src_integer_bits, dst_integer_bits, 16)

#endif /* VECTOR_SIZE END */
Chunosovf450caa2017-11-08 16:09:35 +070062
/* Vector type aliases of width VECTOR_SIZE. VEC_BASE is the quantized
 * input/output element type (DATA_TYPE is uchar for QASYMM8, char for
 * QASYMM8_SIGNED); VEC_INT holds the widened fixed-point intermediates. */
#define VEC_UCHAR VEC_DATA_TYPE(uchar, VECTOR_SIZE)
#define VEC_UINT VEC_DATA_TYPE(uint, VECTOR_SIZE)
#define VEC_INT VEC_DATA_TYPE(int, VECTOR_SIZE)
#define VEC_BASE VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)
Chunosovf450caa2017-11-08 16:09:35 +070067
68#if defined(DIFF_MIN)
69
/** Scales the shifted logits by the quantized beta multiplier (VECTOR_SIZE-wide).
 *
 * When -DINPUT_BETA_MULTIPLIER and -DINPUT_BETA_LEFT_SHIFT are defined and the
 * multiplier is greater than one, the input is left-shifted and then multiplied
 * in fixed-point; otherwise the input is returned unchanged (beta == 1.0).
 */
VEC_INT mult_by_quantized_multiplier_serial(VEC_INT data)
{
    VEC_INT result = data;
#if defined(INPUT_BETA_MULTIPLIER) && defined(INPUT_BETA_LEFT_SHIFT)
    if(INPUT_BETA_MULTIPLIER > 1)
    {
        const VEC_INT shifted = data * (1 << INPUT_BETA_LEFT_SHIFT);
        result                = asymm_mult(shifted, INPUT_BETA_MULTIPLIER);
    }
#endif /* defined(INPUT_BETA_MULTIPLIER) && defined(INPUT_BETA_LEFT_SHIFT) */
    return result;
}
80
/** Scales the shifted logits by the quantized beta multiplier (int4 variant
 * used by the parallel kernel, which always processes 4 elements at a time).
 *
 * Identity when the beta build options are absent or the multiplier is <= 1.
 */
int4 mult_by_quantized_multiplier_parallel(int4 data)
{
    int4 result = data;
#if defined(INPUT_BETA_MULTIPLIER) && defined(INPUT_BETA_LEFT_SHIFT)
    if(INPUT_BETA_MULTIPLIER > 1)
    {
        const int4 shifted = data * (1 << INPUT_BETA_LEFT_SHIFT);
        result             = ASYMM_MULT(shifted, INPUT_BETA_MULTIPLIER, 4);
    }
#endif /* defined(INPUT_BETA_MULTIPLIER) && defined(INPUT_BETA_LEFT_SHIFT) */
    return result;
}
91
Chunosovf450caa2017-11-08 16:09:35 +070092/** Shifts the values of the input tensor by the max calculated in softmax_layer_max kernel,
93 * then gets the exponent of each element as sums all elements across each row.
94 *
Giorgio Arena4402cb92018-02-15 13:37:40 +000095 * @note In case the input is not multiple of 16 -DNON_MULTIPLE_OF_VECTOR_SIZE must be passed.
Chunosovf450caa2017-11-08 16:09:35 +070096 * @note Quantized beta can be optionally passed at compile time using -DINPUT_BETA_MULTIPLIER and -DINPUT_BETA_LEFT_SHIFT (if undefined, assume beta equals 1.0)
97 * @note -DDIFF_MIN must be passed at compile time. It is threshold difference between maximum value of input data and current processed value, it defines whether the value will be taken into account or not.
98 *
 * @param[in]  src_ptr                           Pointer to the source tensor slice. Supported data types: QASYMM8/QASYMM8_SIGNED
100 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
101 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
102 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
103 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
104 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
105 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
106 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
107 * @param[in] max_ptr Pointer to the max values tensor slice. Supported data types: same as @p src_ptr
108 * @param[in] max_stride_x Stride of the max values tensor in X dimension (in bytes)
109 * @param[in] max_step_x max_stride_x * number of elements along X processed per workitem(in bytes)
110 * @param[in] max_stride_y Stride of the max values tensor in Y dimension (in bytes)
111 * @param[in] max_step_y max_stride_y * number of elements along Y processed per workitem(in bytes)
112 * @param[in] max_stride_z Stride of the max values tensor in Z dimension (in bytes)
113 * @param[in] max_step_z max_stride_z * number of elements along Z processed per workitem(in bytes)
114 * @param[in] max_offset_first_element_in_bytes The offset of the first element in the max values tensor
115 * @param[out] dst_ptr Pointer to the destination tensor slice. Supported data types: S32
116 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
117 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
118 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
119 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
120 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
121 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
122 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
123 * @param[out] sum_ptr Pointer to the sum values tensor slice. Supported data types: same as @p dst_ptr
124 * @param[in] sum_stride_x Stride of the sum values tensor in X dimension (in bytes)
125 * @param[in] sum_step_x sum_stride_x * number of elements along X processed per workitem(in bytes)
126 * @param[in] sum_stride_y Stride of the sum values tensor in Y dimension (in bytes)
 * @param[in]  sum_step_y                        sum_stride_y * number of elements along Y processed per workitem(in bytes)
128 * @param[in] sum_stride_z Stride of the sum values tensor in Z dimension (in bytes)
129 * @param[in] sum_step_z sum_stride_z * number of elements along Z processed per workitem(in bytes)
130 * @param[in] sum_offset_first_element_in_bytes The offset of the first element in the sum values tensor
131 * @param[in] width Input image width
132 */
__kernel void softmax_layer_max_shift_exp_sum_quantized_serial(
    TENSOR3D_DECLARATION(src),
    TENSOR3D_DECLARATION(maxo),
    TENSOR3D_DECLARATION(dst),
    TENSOR3D_DECLARATION(sum),
    uint width)
{
    // One work-item processes a whole row: 3D tensors are viewed as 2D images,
    // each invocation anchored at its own (y, z) row.
    Image src  = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
    Image dst  = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
    Image maxo = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(maxo);
    Image sum  = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(sum);

    // Running per-lane maximum, seeded with the smallest representable value.
    VEC_BASE max_val_vec = (VEC_BASE)(MIN_VALUE);

    // Calculate max of row, VECTOR_SIZE elements per iteration.
    const uint width4 = width >> LOG_VECTOR_SIZE;
    for(uint i = 0; i < width4; i++)
    {
        VEC_BASE data = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)offset(&src, i << LOG_VECTOR_SIZE, 0));
        // NOTE: the trailing "16" is ignored by MAX_OP (kept for signature
        // compatibility with the float kernels), so it is harmless even when
        // VECTOR_SIZE != 16.
        max_val_vec = MAX_OP(data, max_val_vec, DATA_TYPE, 16);
    }

#ifdef NON_MULTIPLE_OF_VECTOR_SIZE
    // Handle the tail when width is not a multiple of VECTOR_SIZE: load one
    // more (overlapping-range) vector and mask out-of-bounds lanes to
    // MIN_VALUE so they cannot win the max.
    VEC_BASE vec_min_val = (VEC_BASE)(MIN_VALUE);
    VEC_BASE data        = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)offset(&src, width4 << LOG_VECTOR_SIZE, 0));
    // widx lane is non-zero iff that lane's element index is still < width.
    VEC_UCHAR widx = CONVERT(((VEC_UINT)(width4 << LOG_VECTOR_SIZE) + idx__) < width, VEC_UCHAR);
    max_val_vec    = MAX_OP(max_val_vec, select(vec_min_val, data, widx), DATA_TYPE, 16);
#endif /* NON_MULTIPLE_OF_VECTOR_SIZE */

    // Perform max reduction: fold the vector in halves down to lane 0.
#if VECTOR_SIZE == 16
    max_val_vec.s01234567 = MAX_OP(max_val_vec.s01234567, max_val_vec.s89ABCDEF, DATA_TYPE, 8);
#endif /* VECTOR SIZE 16 END */
#if VECTOR_SIZE >= 8
    max_val_vec.s0123 = MAX_OP(max_val_vec.s0123, max_val_vec.s4567, DATA_TYPE, 4);
#endif /* VECTOR SIZE 8 END */
#if VECTOR_SIZE >= 4
    max_val_vec.s01 = MAX_OP(max_val_vec.s01, max_val_vec.s23, DATA_TYPE, 2);
#endif /* VECTOR SIZE 4 END */
    max_val_vec.s0 = MAX_OP(max_val_vec.s0, max_val_vec.s1, DATA_TYPE, 1);

    // Store the row maximum.
    *((__global DATA_TYPE *)maxo.ptr) = max_val_vec.s0;

    // Second part: shift by the max, exponentiate in fixed point, accumulate.

    // Load max value of 1D logits vector (row)
    int max_val = convert_int(*((__global DATA_TYPE *)offset(&maxo, 0, 0)));

    // Set sum vector, Q(EXP_ACCUMULATION_INT_BITS)
    VEC_INT sum1D = 0;

    // Shift values, exp and sum
    for(uint i = 0; i < width4; i++)
    {
        VEC_BASE data          = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)offset(&src, i << LOG_VECTOR_SIZE, 0));
        VEC_INT data_fp        = CONVERT(data, VEC_INT);
        VEC_INT data_diff      = data_fp - max_val; // always <= 0 by construction
        VEC_INT data_diff_mult = mult_by_quantized_multiplier_serial(data_diff);
        data_fp                = asymm_exp_on_negative_values(data_diff_mult, SCALED_DIFF_INT_BITS);
        data_fp                = asymm_rescale(data_fp, 0, EXP_ACCUMULATION_INT_BITS);
        // The (unscaled) differences are stored to dst; the norm kernel
        // recomputes the exponentials from them.
        VSTORE(VECTOR_SIZE)
        (data_diff, 0, (__global int *)offset(&dst, i << LOG_VECTOR_SIZE, 0));
        // Lanes whose difference falls below DIFF_MIN contribute MIN_VALUE
        // instead of their exponential.
        sum1D = sum1D + select(MIN_VALUE, data_fp, data_diff >= (VEC_INT)(DIFF_MIN));
    }

#ifdef NON_MULTIPLE_OF_VECTOR_SIZE
    // Tail handling, mirroring the loop body with an additional widx_ mask so
    // out-of-bounds lanes contribute MIN_VALUE to the sum.
    data                   = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)offset(&src, width4 << LOG_VECTOR_SIZE, 0));
    VEC_INT data_fp        = CONVERT(data, VEC_INT);
    VEC_INT data_diff      = data_fp - max_val;
    VEC_INT data_diff_mult = mult_by_quantized_multiplier_serial(data_diff);
    data_fp                = asymm_exp_on_negative_values(data_diff_mult, SCALED_DIFF_INT_BITS);
    data_fp                = asymm_rescale(data_fp, 0, EXP_ACCUMULATION_INT_BITS);
    VEC_INT widx_          = CONVERT(((VEC_UINT)(width4 << LOG_VECTOR_SIZE) + idx__) < width, VEC_INT);
    VSTORE(VECTOR_SIZE)
    (data_diff, 0, (__global int *)offset(&dst, width4 << LOG_VECTOR_SIZE, 0));
    data_fp = select(MIN_VALUE, data_fp, data_diff >= (VEC_INT)(DIFF_MIN));
    sum1D   = sum1D + select(MIN_VALUE, data_fp, widx_);
#endif /* NON_MULTIPLE_OF_VECTOR_SIZE */

    // Perform sum reduction: same halving scheme as the max reduction.
#if VECTOR_SIZE == 16
    sum1D.s01234567 = ADD_OP(sum1D.s01234567, sum1D.s89ABCDEF, DATA_TYPE, 8);
#endif /* VECTOR SIZE 16 END */
#if VECTOR_SIZE >= 8
    sum1D.s0123 = ADD_OP(sum1D.s0123, sum1D.s4567, DATA_TYPE, 4);
#endif /* VECTOR SIZE 8 END */
#if VECTOR_SIZE >= 4
    sum1D.s01 = ADD_OP(sum1D.s01, sum1D.s23, DATA_TYPE, 2);
#endif /* VECTOR SIZE 4 END */
    sum1D.s0 = ADD_OP(sum1D.s0, sum1D.s1, DATA_TYPE, 1);

    // Calculate and store result
    *((__global int *)sum.ptr) = sum1D.s0;
}
230
Giorgio Arena4402cb92018-02-15 13:37:40 +0000231/** Identifies the maximum value across the 1st dimension and shifts the values of the input tensor by this maximum value,
232 * then gets the exponent of each element as sums all elements across each row.
233 *
234 * @note Datatype must be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
Giorgio Arena4402cb92018-02-15 13:37:40 +0000235 * @note In case the input is not a multiple of VECTOR_SIZE (2,4,8,16) -DNON_MULTIPLE_OF_VECTOR_SIZE must be passed.
Giorgio Arena4402cb92018-02-15 13:37:40 +0000236 *
 * @param[in]  src_ptr                            Pointer to the source tensor slice. Supported data types: QASYMM8/QASYMM8_SIGNED
Giorgio Arena4402cb92018-02-15 13:37:40 +0000238 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
239 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
240 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
241 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
242 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
243 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
244 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
245 * @param[in] maxo_ptr Pointer to the max values tensor slice. Supported data types: same as @p src_ptr
246 * @param[in] maxo_stride_x Stride of the max values tensor in X dimension (in bytes)
247 * @param[in] maxo_step_x max_stride_x * number of elements along X processed per workitem(in bytes)
248 * @param[in] maxo_stride_y Stride of the max values tensor in Y dimension (in bytes)
249 * @param[in] maxo_step_y max_stride_y * number of elements along Y processed per workitem(in bytes)
250 * @param[in] maxo_stride_z Stride of the max values tensor in Z dimension (in bytes)
251 * @param[in] maxo_step_z max_stride_z * number of elements along Z processed per workitem(in bytes)
252 * @param[in] maxo_offset_first_element_in_bytes The offset of the first element in the max values tensor
253 * @param[out] dst_ptr Pointer to the destination tensor slice. Supported data types: same as @p src_ptr
254 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
255 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
256 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
257 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
258 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
259 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
260 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
261 * @param[out] sum_ptr Pointer to the sum values tensor slice. Supported data types: same as @p src_ptr
262 * @param[in] sum_stride_x Stride of the sum values tensor in X dimension (in bytes)
263 * @param[in] sum_step_x sum_stride_x * number of elements along X processed per workitem(in bytes)
264 * @param[in] sum_stride_y Stride of the sum values tensor in Y dimension (in bytes)
 * @param[in]  sum_step_y                         sum_stride_y * number of elements along Y processed per workitem(in bytes)
266 * @param[in] sum_stride_z Stride of the sum values tensor in Z dimension (in bytes)
267 * @param[in] sum_step_z sum_stride_z * number of elements along Z processed per workitem(in bytes)
268 * @param[in] sum_offset_first_element_in_bytes The offset of the first element in the sum values tensor
269 * @param[in] width Input image width
270 */
__kernel void softmax_layer_max_shift_exp_sum_quantized_parallel(
    TENSOR3D_DECLARATION(src),
    TENSOR3D_DECLARATION(maxo),
    TENSOR3D_DECLARATION(dst),
    TENSOR3D_DECLARATION(sum),
    uint width)
{
    // GRID_SIZE work-items cooperate on one row; each handles an interleaved
    // stripe of 4-element chunks, then the partial results are combined with a
    // local-memory tree reduction.
    Image src  = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
    Image dst  = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
    Image maxo = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(maxo);
    Image sum  = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(sum);

    const uint4 idx4 = (uint4)(0, 1, 2, 3);
    const uint lid   = get_local_id(0);

    // Define one temporary vector per work-item, plus the broadcast row max.
    __local int4 tmp_local[GRID_SIZE];
    __local DATA_TYPE max_local;

    VEC_DATA_TYPE(DATA_TYPE, 4)
    vec_min_val = (VEC_DATA_TYPE(DATA_TYPE, 4))(MIN_VALUE);
    VEC_DATA_TYPE(DATA_TYPE, 4)
    max_val_vec = vec_min_val;

    // Number of elements per work-item.
    const uint row = width / GRID_SIZE;
    // Number of iterations per work-item (4 elements per iteration).
    const uint width_ = row >> 2;
    // Calculate max of this work-item's stripe. The per-work-item base offset
    // is baked into the Image struct via the kernel's step_x.
    uint i = 0;
    for(; i < width_; i++)
    {
        VEC_DATA_TYPE(DATA_TYPE, 4)
        data_max    = vload4(0, (__global DATA_TYPE *)offset(&src, i * GRID_SIZE * 4, 0));
        max_val_vec = MAX_OP(data_max, max_val_vec, DATA_TYPE, 4);
    }
#ifdef NON_MULTIPLE_OF_GRID_SIZE
    // How many work-items needed to complete the computation.
    //TODO: Optimize this calculation (avoid %).
    int boundary_workitems = (width % (GRID_SIZE * 4)) / 4;
    if(lid < boundary_workitems)
    {
        // Low-lid work-items take one extra full 4-element chunk.
        VEC_DATA_TYPE(DATA_TYPE, 4)
        data_max    = vload4(0, (__global DATA_TYPE *)offset(&src, i * GRID_SIZE * 4, 0));
        max_val_vec = MAX_OP(data_max, max_val_vec, DATA_TYPE, 4);
    }
#ifdef NON_MULTIPLE_OF_VECTOR_SIZE
    if(boundary_workitems == 0)
    {
        boundary_workitems = GRID_SIZE;
        i--;
    }
    if(lid == (boundary_workitems - 1))
    {
        // Handle non multiple of 4: the designated work-item loads the final
        // partial chunk and masks out-of-range lanes to MIN_VALUE.
        VEC_DATA_TYPE(DATA_TYPE, 4)
        data_max = vload4(0, (__global DATA_TYPE *)offset(&src, (GRID_SIZE * i * 4) + 4, 0));
        VEC_DATA_TYPE(DATA_TYPE, 4)
        widx        = CONVERT((((uint4)(GRID_SIZE * i * 4) + boundary_workitems * 4 + idx4) < width), VEC_DATA_TYPE(DATA_TYPE, 4));
        max_val_vec = MAX_OP(max_val_vec, select(vec_min_val, data_max, widx), DATA_TYPE, 4);
    }
#endif /* NON_MULTIPLE_OF_VECTOR_SIZE */
#endif /* NON_MULTIPLE_OF_GRID_SIZE */
    // Publish this work-item's partial max (widened to int4) for reduction.
    tmp_local[lid] = convert_int4(max_val_vec);

    barrier(CLK_LOCAL_MEM_FENCE);

    // Tree reduction over local memory. Barriers sit outside the lid checks so
    // every work-item reaches them; each stage halves the active range.
    if(GRID_SIZE >= 256)
    {
        if(lid < 128)
        {
            tmp_local[lid] = MAX_OP(tmp_local[lid + 128], tmp_local[lid], int, 4);
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(GRID_SIZE >= 128)
    {
        if(lid < 64)
        {
            tmp_local[lid] = MAX_OP(tmp_local[lid + 64], tmp_local[lid], int, 4);
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(GRID_SIZE >= 64)
    {
        if(lid < 32)
        {
            tmp_local[lid] = MAX_OP(tmp_local[lid + 32], tmp_local[lid], int, 4);
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(GRID_SIZE >= 32)
    {
        if(lid < 16)
        {
            tmp_local[lid] = MAX_OP(tmp_local[lid + 16], tmp_local[lid], int, 4);
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(GRID_SIZE >= 16)
    {
        if(lid < 8)
        {
            tmp_local[lid] = MAX_OP(tmp_local[lid + 8], tmp_local[lid], int, 4);
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(GRID_SIZE >= 8)
    {
        if(lid < 4)
        {
            tmp_local[lid] = MAX_OP(tmp_local[lid + 4], tmp_local[lid], int, 4);
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(GRID_SIZE >= 4)
    {
        if(lid < 2)
        {
            tmp_local[lid] = MAX_OP(tmp_local[lid + 2], tmp_local[lid], int, 4);
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(lid == 0)
    {
        // Final fold of the two remaining partials, then lane-wise down to s0,
        // broadcast to the whole work-group through max_local.
        max_val_vec     = MAX_OP(CONVERT((tmp_local[lid + 1]), VEC_DATA_TYPE(DATA_TYPE, 4)), CONVERT((tmp_local[lid]), VEC_DATA_TYPE(DATA_TYPE, 4)), DATA_TYPE, 4);
        max_val_vec.s01 = MAX_OP(max_val_vec.s01, max_val_vec.s23, DATA_TYPE, 2);
        max_val_vec.s0  = MAX_OP(max_val_vec.s0, max_val_vec.s1, DATA_TYPE, 1);
        max_local       = max_val_vec.s0;
    }
    barrier(CLK_LOCAL_MEM_FENCE);

    /* Second section: shift by the row max, exponentiate, accumulate the sum */

    // Set sum vector
    int4 sum1D  = 0;
    int max_val = convert_int(max_local);

    // Shift values, exp and sum over this work-item's stripe
    for(i = 0; i < width_; i++)
    {
        VEC_DATA_TYPE(DATA_TYPE, 4)
        data                = vload4(0, (__global DATA_TYPE *)offset(&src, i * GRID_SIZE * 4, 0));
        int4 data_fp        = convert_int4(data);
        int4 data_diff      = data_fp - max_val;
        int4 data_diff_mult = mult_by_quantized_multiplier_parallel(data_diff);
        data_fp             = ASYMM_EXP_ON_NEGATIVE_VALUES(data_diff_mult, SCALED_DIFF_INT_BITS, 4);
        data_fp             = ASYMM_RESCALE(data_fp, 0, EXP_ACCUMULATION_INT_BITS, 4);
        // Differences are stored to dst for the norm kernel to reuse.
        vstore4(data_diff, 0, (__global int *)offset(&dst, i * GRID_SIZE * 4, 0));
        // Lanes below the DIFF_MIN threshold contribute MIN_VALUE to the sum.
        sum1D = sum1D + select(MIN_VALUE, data_fp, data_diff >= (int4)(DIFF_MIN));
    }
#ifdef NON_MULTIPLE_OF_GRID_SIZE
    //TODO: Optimize the calculation (avoid %).
    boundary_workitems = (width % (GRID_SIZE * 4)) / 4;
    if(lid < boundary_workitems)
    {
        VEC_DATA_TYPE(DATA_TYPE, 4)
        data                = vload4(0, (__global DATA_TYPE *)offset(&src, i * GRID_SIZE * 4, 0));
        int4 data_fp        = convert_int4(data);
        int4 data_diff      = data_fp - max_val;
        int4 data_diff_mult = mult_by_quantized_multiplier_parallel(data_diff);
        data_fp             = ASYMM_EXP_ON_NEGATIVE_VALUES(data_diff_mult, SCALED_DIFF_INT_BITS, 4);
        data_fp             = ASYMM_RESCALE(data_fp, 0, EXP_ACCUMULATION_INT_BITS, 4);
        vstore4(data_diff, 0, (__global int *)offset(&dst, i * GRID_SIZE * 4, 0));
        sum1D = sum1D + select(MIN_VALUE, data_fp, data_diff >= (int4)(DIFF_MIN));
    }
#ifdef NON_MULTIPLE_OF_VECTOR_SIZE
    if(boundary_workitems == 0)
    {
        boundary_workitems = GRID_SIZE;
        i--;
    }
    if(lid == (boundary_workitems - 1))
    {
        // Handle non multiple of vector size ((GRID_SIZE * i * 4) + 4, 0); move 4 positions ahead, *4 is due to the stride
        VEC_DATA_TYPE(DATA_TYPE, 4)
        data                = vload4(0, (__global DATA_TYPE *)offset(&src, i * GRID_SIZE * 4 + 4, 0));
        int4 data_fp        = convert_int4(data);
        int4 data_diff      = data_fp - max_val;
        int4 data_diff_mult = mult_by_quantized_multiplier_parallel(data_diff);
        data_fp             = ASYMM_EXP_ON_NEGATIVE_VALUES(data_diff_mult, SCALED_DIFF_INT_BITS, 4);
        data_fp             = ASYMM_RESCALE(data_fp, 0, EXP_ACCUMULATION_INT_BITS, 4);
        // Mask out-of-range lanes before thresholding, as in the max pass.
        int4 widx = convert_int4(((uint4)(GRID_SIZE * i * 4) + boundary_workitems * 4 + idx4) < width);
        data_fp   = select(MIN_VALUE, data_fp, widx);
        vstore4(data_diff, 0, (__global int *)offset(&dst, i * GRID_SIZE * 4 + 4, 0));
        sum1D = sum1D + select(MIN_VALUE, data_fp, data_diff >= (int4)(DIFF_MIN));
    }
#endif /* NON_MULTIPLE_OF_VECTOR_SIZE */
#endif /* NON_MULTIPLE_OF_GRID_SIZE */
    // Publish this work-item's partial sum for the second tree reduction.
    tmp_local[lid] = sum1D;

    barrier(CLK_LOCAL_MEM_FENCE);

    // Same tree-reduction structure as above, with ADD instead of MAX.
    if(GRID_SIZE >= 256)
    {
        if(lid < 128)
        {
            tmp_local[lid] = ADD_OP(tmp_local[lid + 128], tmp_local[lid], int, 4);
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(GRID_SIZE >= 128)
    {
        if(lid < 64)
        {
            tmp_local[lid] = ADD_OP(tmp_local[lid + 64], tmp_local[lid], int, 4);
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(GRID_SIZE >= 64)
    {
        if(lid < 32)
        {
            tmp_local[lid] = ADD_OP(tmp_local[lid + 32], tmp_local[lid], int, 4);
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(GRID_SIZE >= 32)
    {
        if(lid < 16)
        {
            tmp_local[lid] = ADD_OP(tmp_local[lid + 16], tmp_local[lid], int, 4);
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(GRID_SIZE >= 16)
    {
        if(lid < 8)
        {
            tmp_local[lid] = ADD_OP(tmp_local[lid + 8], tmp_local[lid], int, 4);
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(GRID_SIZE >= 8)
    {
        if(lid < 4)
        {
            tmp_local[lid] = ADD_OP(tmp_local[lid + 4], tmp_local[lid], int, 4);
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(GRID_SIZE >= 4)
    {
        if(lid < 2)
        {
            tmp_local[lid] = ADD_OP(tmp_local[lid + 2], tmp_local[lid], int, 4);
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(lid == 0)
    {
        sum1D = ADD_OP(tmp_local[lid + 1], tmp_local[lid], int, 4);
        // Fold the remaining 4 lanes down to a scalar and store it.
        sum1D.s01 = ADD_OP(sum1D.s01, sum1D.s23, int, 2);
        sum1D.s0  = ADD_OP(sum1D.s0, sum1D.s1, int, 1);
        *((__global int *)sum.ptr) = sum1D.s0;
    }
}
529
Chunosovf450caa2017-11-08 16:09:35 +0700530/** Divides all the values of the input tensor by the sum calculated from softmax_layer_shift_exp_sum kernel.
531 *
Chunosovf450caa2017-11-08 16:09:35 +0700532 * @note Quantized beta can be optionally passed at compile time using -DINPUT_BETA_MULTIPLIER and -DINPUT_BETA_LEFT_SHIFT (if undefined, assume beta equals 1.0)
533 * @note -DDIFF_MIN must be passed at compile time. It is threshold difference between maximum value of input data and current processed value, it defines whether the value will be taken into account or not.
534 *
535 * @param[in] src_ptr Pointer to the source tensor slice. Supported data types: S32
536 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
537 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
538 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
539 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
540 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
541 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
542 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
543 * @param[in] sum_ptr Pointer to the sum values tensor slice. Supported data types: same as @p src_ptr
544 * @param[in] sum_stride_x Stride of the sum values tensor in X dimension (in bytes)
545 * @param[in] sum_step_x sum_stride_x * number of elements along X processed per workitem(in bytes)
546 * @param[in] sum_stride_y Stride of the sum values tensor in Y dimension (in bytes)
547 * @param[in] sum_step_y sum_stride_y * number of elements along Y processed per workitem(in bytes)
548 * @param[in] sum_stride_z Stride of the sum values tensor in Z dimension (in bytes)
549 * @param[in] sum_step_z sum_stride_z * number of elements along Z processed per workitem(in bytes)
550 * @param[in] sum_offset_first_element_in_bytes The offset of the first element in the sum values tensor
551 * @param[out] dst_ptr Pointer to the destination tensor slice. Supported data types: QASYMM8
552 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
553 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
554 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
555 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
556 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
557 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
558 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
559 */
__kernel void softmax_layer_norm_quantized(
    TENSOR3D_DECLARATION(src),
    TENSOR3D_DECLARATION(sum),
    TENSOR3D_DECLARATION(dst))
{
    Image src = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
    Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
    Image sum = CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(sum);

    // Load the accumulated exponential sum of this row (computed by the
    // max_shift_exp_sum kernel).
    int sum_val = *((__global int *)offset(&sum, 0, get_global_id(1)));

    // Compute 1/sum in fixed point. The sum is normalised so its leading bit
    // sits at bit 31, then the reciprocal of (1 + x) for x in [0, 1) is
    // evaluated; num_bits_over_unit records the shift needed to undo the
    // normalisation later.
    // It will be better to calculate this in prev layer and pass here as parameter
    uint sum_val_u               = convert_uint(sum_val);
    int headroom_plus_one        = clz(sum_val_u);
    int num_bits_over_unit       = EXP_ACCUMULATION_INT_BITS - headroom_plus_one;
    int shifted_sum_minus_one_1  = convert_int((sum_val_u << headroom_plus_one) - (1u << 31));
    int16 shifted_sum_minus_one  = shifted_sum_minus_one_1;
    int16 shifted_scale          = ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1(shifted_sum_minus_one, 16);

    // Reload the stored (value - max) differences and re-apply the beta scale.
    // It was already calculated in prev layer, should be stored into tmp output and reused
    int16 data_diff      = vload16(0, (__global int *)offset(&src, 0, 0));
    int16 data_diff_mult = data_diff;
#if defined(INPUT_BETA_MULTIPLIER) && defined(INPUT_BETA_LEFT_SHIFT)
    if(INPUT_BETA_MULTIPLIER > 1)
    {
        data_diff_mult = ASYMM_MULT(data_diff * (1 << INPUT_BETA_LEFT_SHIFT), INPUT_BETA_MULTIPLIER, 16);
    }
#endif /* defined(INPUT_BETA_MULTIPLIER) && defined(INPUT_BETA_LEFT_SHIFT) */

    // exp(diff) * (1/sum), then shift down to the 8-bit output range.
    int16 data = ASYMM_EXP_ON_NEGATIVE_VALUES(data_diff_mult, SCALED_DIFF_INT_BITS, 16);
    data       = ASYMM_MULT(shifted_scale, data, 16);
    data       = ASYMM_ROUNDING_DIVIDE_BY_POW2(data, num_bits_over_unit + 31 - 8, 16);
#ifdef QASYMM8_SIGNED
    // Re-centre onto the signed quantized range by adding MIN_VALUE.
    data = ADD_OP(data, (int16)(MIN_VALUE), int, 16);
#endif /* QASYMM8_SIGNED */
    // Elements filtered out by the DIFF_MIN threshold map to MIN_VALUE.
    data = select(MIN_VALUE, data, data_diff >= (int16)(DIFF_MIN));
    vstore16(CONVERT_SAT(data, VEC_DATA_TYPE(DATA_TYPE, 16)), 0, (__global DATA_TYPE *)offset(&dst, 0, 0));
}
599
600#endif /* defined(DIFF_MIN) */