/*
 * Copyright (c) 2017-2020 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#include "helpers_asymm.h"

#if defined(DATA_TYPE) && defined(MIN_VALUE) && defined(VECTOR_SIZE) && defined(VECTOR_SIZE_LEFTOVER) && defined(DIFF_MIN)

#define VEC_BASE VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)
#define VEC_INT VEC_DATA_TYPE(int, VECTOR_SIZE)
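// For example, with the hypothetical build options -DDATA_TYPE=uchar and -DVECTOR_SIZE=16, VEC_BASE expands
// to uchar16 and VEC_INT to int16, since VEC_DATA_TYPE simply concatenates the type and the vector size.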

/** Divides all the values of the input tensor by the sum calculated by the softmax_layer_max_shift_exp_sum_quantized kernels.
 *
 * @note The data type must be given as a preprocessor argument using -DDATA_TYPE, e.g. -DDATA_TYPE=uchar
 * @note The zero value for the given data type must be given as a preprocessor argument using -DMIN_VALUE, e.g. -DMIN_VALUE=-128
 * @note The vector size must be given as a preprocessor argument using -DVECTOR_SIZE=size, e.g. -DVECTOR_SIZE=16
 * @note The leftover vector size must be passed at compile time using -DVECTOR_SIZE_LEFTOVER, e.g. -DVECTOR_SIZE_LEFTOVER=3. It is defined as the remainder between the input's first dimension and VECTOR_SIZE
 * @note Quantized beta can optionally be passed at compile time using -DINPUT_BETA_MULTIPLIER and -DINPUT_BETA_LEFT_SHIFT (if undefined, beta is assumed to be 1.0)
 * @note Additional quantization data must be passed at compile time using -DSCALED_DIFF_INT_BITS and -DEXP_ACCUMULATION_INT_BITS.
 * @note -DDIFF_MIN must be passed at compile time. It is the threshold difference between the maximum value of the input data and the value currently processed; it determines whether the value is taken into account or not.
 * @note In case the input's data type is QASYMM8_SIGNED, -DQASYMM8_SIGNED must be passed.
 *
 * @param[in]  src_ptr Pointer to the source tensor slice. Supported data types: S32
 * @param[in]  src_stride_x Stride of the source tensor in X dimension (in bytes)
 * @param[in]  src_step_x src_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  src_stride_y Stride of the source tensor in Y dimension (in bytes)
 * @param[in]  src_step_y src_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  src_stride_z Stride of the source tensor in Z dimension (in bytes)
 * @param[in]  src_step_z src_stride_z * number of elements along Z processed per workitem (in bytes)
 * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
 * @param[in]  sum_ptr Pointer to the sum values tensor slice. Supported data types: same as @p src_ptr
 * @param[in]  sum_stride_x Stride of the sum values tensor in X dimension (in bytes)
 * @param[in]  sum_step_x sum_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  sum_stride_y Stride of the sum values tensor in Y dimension (in bytes)
 * @param[in]  sum_step_y sum_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  sum_stride_z Stride of the sum values tensor in Z dimension (in bytes)
 * @param[in]  sum_step_z sum_stride_z * number of elements along Z processed per workitem (in bytes)
 * @param[in]  sum_offset_first_element_in_bytes The offset of the first element in the sum values tensor
 * @param[out] dst_ptr Pointer to the destination tensor slice. Supported data types: QASYMM8/QASYMM8_SIGNED
 * @param[in]  dst_stride_x Stride of the destination tensor in X dimension (in bytes)
 * @param[in]  dst_step_x dst_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
 * @param[in]  dst_step_y dst_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
 * @param[in]  dst_step_z dst_stride_z * number of elements along Z processed per workitem (in bytes)
 * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
 */
__kernel void softmax_layer_norm_quantized(
    TENSOR3D_DECLARATION(src),
    TENSOR3D_DECLARATION(sum),
    TENSOR3D_DECLARATION(dst))
{
    const int x_offs = max((int)(get_global_id(0) * VECTOR_SIZE - (VECTOR_SIZE - VECTOR_SIZE_LEFTOVER) % VECTOR_SIZE), 0);

    __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x_offs * sizeof(int) + get_global_id(1) * src_stride_y + get_global_id(2) * src_stride_z;
    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_offs * sizeof(DATA_TYPE) + get_global_id(1) * dst_stride_y + get_global_id(2) * dst_stride_z;

    Image sum = CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(sum);

    // Load the sum of exponentials of the 1D logits vector (row)
    int sum_val = *((__global int *)offset(&sum, 0, get_global_id(1)));

    // It would be better to calculate this in the previous kernel and pass it here as a parameter
    uint sum_val_u = convert_uint(sum_val);
    int headroom_plus_one = clz(sum_val_u);
    int num_bits_over_unit = EXP_ACCUMULATION_INT_BITS - headroom_plus_one;
    int shifted_sum_minus_one_1 = convert_int((sum_val_u << headroom_plus_one) - (1u << 31));
    VEC_INT shifted_sum_minus_one = shifted_sum_minus_one_1;
    VEC_INT shifted_scale = ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1(shifted_sum_minus_one, VECTOR_SIZE);
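    // The sum is left-shifted until its most significant bit reaches bit 31; read as a Q1.31 value the shifted
    // sum lies in [1, 2), so subtracting one (1u << 31) yields x in [0, 1) in Q0.31. shifted_scale is therefore
    // approximately 1 / (1 + x), the reciprocal of the normalised sum, and the power of two removed by the
    // normalisation is reintroduced by the rounding divide below (num_bits_over_unit + 31 - 8).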

    // This was already calculated in the previous kernel; it could be stored in a temporary output and reused
    VEC_INT data_diff = VLOAD(VECTOR_SIZE)(0, (__global int *)src_addr);
    VEC_INT data_diff_mult = data_diff;
#if defined(INPUT_BETA_MULTIPLIER) && defined(INPUT_BETA_LEFT_SHIFT)
    if(INPUT_BETA_MULTIPLIER > 1)
    {
        data_diff_mult = ASYMM_MULT(data_diff * (1 << INPUT_BETA_LEFT_SHIFT), INPUT_BETA_MULTIPLIER, VECTOR_SIZE);
    }
#endif /* defined(INPUT_BETA_MULTIPLIER) && defined(INPUT_BETA_LEFT_SHIFT) */
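    // ASYMM_MULT is a saturating rounding doubling high multiply (roughly a * b / 2^31), so the pre-shift by
    // INPUT_BETA_LEFT_SHIFT followed by the multiply applies beta to the difference as a gemmlowp-style
    // fixed-point multiplier.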

    VEC_INT data = ASYMM_EXP_ON_NEGATIVE_VALUES(data_diff_mult, SCALED_DIFF_INT_BITS, VECTOR_SIZE);
    data = ASYMM_MULT(shifted_scale, data, VECTOR_SIZE);
    data = ASYMM_ROUNDING_DIVIDE_BY_POW2(data, num_bits_over_unit + 31 - 8, VECTOR_SIZE);
#ifdef QASYMM8_SIGNED
    data += (VEC_INT)(MIN_VALUE);
#endif /* QASYMM8_SIGNED */
    data = select(MIN_VALUE, data, data_diff >= (VEC_INT)(DIFF_MIN));
    VEC_BASE data0 = CONVERT_SAT(data, VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE));

    STORE_VECTOR_SELECT(data, DATA_TYPE, dst_addr, VECTOR_SIZE, VECTOR_SIZE_LEFTOVER, VECTOR_SIZE_LEFTOVER != 0 && get_global_id(0) == 0)
}
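// Example (hypothetical) build options for softmax_layer_norm_quantized, for a QASYMM8 input whose row width
// is 131 with VECTOR_SIZE 16 (so VECTOR_SIZE_LEFTOVER = 131 % 16 = 3); the quantization-related values below
// are illustrative only:
//   -DDATA_TYPE=uchar -DMIN_VALUE=0 -DVECTOR_SIZE=16 -DVECTOR_SIZE_LEFTOVER=3
//   -DSCALED_DIFF_INT_BITS=5 -DEXP_ACCUMULATION_INT_BITS=12 -DDIFF_MIN=-128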

#if defined(SRC_WIDTH) && defined(LOG_VECTOR_SIZE)

/* Number of workitems in dimension 0. */
#if !defined(GRID_SIZE)
#define GRID_SIZE 1
#endif /* !defined(GRID_SIZE) */

#define VEC_UINT VEC_DATA_TYPE(uint, VECTOR_SIZE)

VEC_INT mult_by_quantized_multiplier(VEC_INT data)
{
#if defined(INPUT_BETA_MULTIPLIER) && defined(INPUT_BETA_LEFT_SHIFT)
    if(INPUT_BETA_MULTIPLIER > 1)
    {
        return ASYMM_MULT(data * (1 << INPUT_BETA_LEFT_SHIFT), INPUT_BETA_MULTIPLIER, VECTOR_SIZE);
    }
#endif /* defined(INPUT_BETA_MULTIPLIER) && defined(INPUT_BETA_LEFT_SHIFT) */
    return data;
}

/** Identifies the maximum value across the 1st dimension and shifts the values of the input tensor by this maximum value,
 * then gets the exponent of each element and sums all elements across each row.
 *
 * @note The data type must be given as a preprocessor argument using -DDATA_TYPE, e.g. -DDATA_TYPE=uchar
 * @note The zero value for the given data type must be given as a preprocessor argument using -DMIN_VALUE, e.g. -DMIN_VALUE=-128
 * @note The vector size must be given as a preprocessor argument using -DVECTOR_SIZE=size, e.g. -DVECTOR_SIZE=16
 * @note The leftover vector size must be passed at compile time using -DVECTOR_SIZE_LEFTOVER, e.g. -DVECTOR_SIZE_LEFTOVER=3. It is defined as the remainder between the input's first dimension and VECTOR_SIZE
 * @note In case the input is not a multiple of VECTOR_SIZE, -DNON_MULTIPLE_OF_VECTOR_SIZE must be passed.
 * @note Quantized beta can optionally be passed at compile time using -DINPUT_BETA_MULTIPLIER and -DINPUT_BETA_LEFT_SHIFT (if undefined, beta is assumed to be 1.0)
 * @note Additional quantization data must be passed at compile time using -DSCALED_DIFF_INT_BITS and -DEXP_ACCUMULATION_INT_BITS.
 * @note -DDIFF_MIN must be passed at compile time. It is the threshold difference between the maximum value of the input data and the value currently processed; it determines whether the value is taken into account or not.
 * @note In case the input's data type is QASYMM8_SIGNED, -DQASYMM8_SIGNED must be passed.
 *
 * @param[in]  src_ptr Pointer to the source tensor slice. Supported data types: QASYMM8/QASYMM8_SIGNED
 * @param[in]  src_stride_x Stride of the source tensor in X dimension (in bytes)
 * @param[in]  src_step_x src_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  src_stride_y Stride of the source tensor in Y dimension (in bytes)
 * @param[in]  src_step_y src_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  src_stride_z Stride of the source tensor in Z dimension (in bytes)
 * @param[in]  src_step_z src_stride_z * number of elements along Z processed per workitem (in bytes)
 * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
 * @param[in]  maxo_ptr Pointer to the max values tensor slice. Supported data types: same as @p src_ptr
 * @param[in]  maxo_stride_x Stride of the max values tensor in X dimension (in bytes)
 * @param[in]  maxo_step_x maxo_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  maxo_stride_y Stride of the max values tensor in Y dimension (in bytes)
 * @param[in]  maxo_step_y maxo_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  maxo_stride_z Stride of the max values tensor in Z dimension (in bytes)
 * @param[in]  maxo_step_z maxo_stride_z * number of elements along Z processed per workitem (in bytes)
 * @param[in]  maxo_offset_first_element_in_bytes The offset of the first element in the max values tensor
 * @param[out] dst_ptr Pointer to the destination tensor slice. Supported data types: S32
 * @param[in]  dst_stride_x Stride of the destination tensor in X dimension (in bytes)
 * @param[in]  dst_step_x dst_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
 * @param[in]  dst_step_y dst_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
 * @param[in]  dst_step_z dst_stride_z * number of elements along Z processed per workitem (in bytes)
 * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
 * @param[out] sum_ptr Pointer to the sum values tensor slice. Supported data types: same as @p dst_ptr
 * @param[in]  sum_stride_x Stride of the sum values tensor in X dimension (in bytes)
 * @param[in]  sum_step_x sum_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  sum_stride_y Stride of the sum values tensor in Y dimension (in bytes)
 * @param[in]  sum_step_y sum_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  sum_stride_z Stride of the sum values tensor in Z dimension (in bytes)
 * @param[in]  sum_step_z sum_stride_z * number of elements along Z processed per workitem (in bytes)
 * @param[in]  sum_offset_first_element_in_bytes The offset of the first element in the sum values tensor
 */
__kernel void softmax_layer_max_shift_exp_sum_quantized_serial(
    TENSOR3D_DECLARATION(src),
    TENSOR3D_DECLARATION(maxo),
    TENSOR3D_DECLARATION(dst),
    TENSOR3D_DECLARATION(sum))
{
    __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + get_global_id(1) * src_stride_y + get_global_id(2) * src_stride_z;
    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + get_global_id(1) * dst_stride_y + get_global_id(2) * dst_stride_z;

    Image maxo = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(maxo);
    Image sum = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(sum);

    VEC_BASE max_val_vec = (VEC_BASE)(MIN_VALUE);

    // Calculate max of row
#ifdef NON_MULTIPLE_OF_VECTOR_SIZE
    VEC_BASE vec_min_val = (VEC_BASE)(MIN_VALUE);
    VEC_BASE data = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)src_addr);
    VEC_INT widx = (VEC_INT)VECTOR_SIZE_LEFTOVER > VEC_OFFS(int, VECTOR_SIZE);
    max_val_vec = max(max_val_vec, select(vec_min_val, data, CONVERT(widx, VEC_BASE)));
#endif /* NON_MULTIPLE_OF_VECTOR_SIZE */
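    // When NON_MULTIPLE_OF_VECTOR_SIZE is defined, the first VECTOR_SIZE_LEFTOVER elements of the row are
    // handled above: a full vector is loaded and the lanes beyond the leftover are replaced with MIN_VALUE via
    // widx, so they cannot influence the row maximum. The loop below then starts at i = VECTOR_SIZE_LEFTOVER
    // and only processes full, VECTOR_SIZE-wide chunks.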

    for(uint i = VECTOR_SIZE_LEFTOVER; i < SRC_WIDTH; i += VECTOR_SIZE)
    {
        VEC_BASE data = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(src_addr + i * sizeof(DATA_TYPE)));
        max_val_vec = max(data, max_val_vec);
    }

    // Perform max reduction
    DATA_TYPE max_local = MAX_REDUCE(max_val_vec, VECTOR_SIZE);
    *((__global DATA_TYPE *)maxo.ptr) = max_local;

    // Second part

    // Convert the max value of the 1D logits vector (row) to int
    int max_val = convert_int(max_local);

    // Set sum vector, Q(EXP_ACCUMULATION_INT_BITS)
    VEC_INT sum1D = 0;

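    // Each difference (data - max, always <= 0) is scaled by beta where requested, passed to
    // ASYMM_EXP_ON_NEGATIVE_VALUES as a Q(SCALED_DIFF_INT_BITS) fixed-point value, and the resulting
    // exponential (in Q0.31) is rescaled to Q(EXP_ACCUMULATION_INT_BITS) so that roughly
    // 2^EXP_ACCUMULATION_INT_BITS terms can be accumulated in a 32-bit integer without overflow.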
#ifdef NON_MULTIPLE_OF_VECTOR_SIZE
    VEC_INT data_fp = CONVERT(data, VEC_INT);
    VEC_INT data_diff = data_fp - max_val;
    VEC_INT data_diff_mult = mult_by_quantized_multiplier(data_diff);
    data_fp = ASYMM_EXP_ON_NEGATIVE_VALUES(data_diff_mult, SCALED_DIFF_INT_BITS, VECTOR_SIZE);
    data_fp = ASYMM_RESCALE(data_fp, 0, EXP_ACCUMULATION_INT_BITS, VECTOR_SIZE);
    VSTORE_PARTIAL(VECTOR_SIZE, VECTOR_SIZE_LEFTOVER)
    (data_diff, 0, (__global int *)dst_addr);
    data_fp = select(0, data_fp, data_diff >= (VEC_INT)(DIFF_MIN));
    sum1D += select(0, data_fp, widx);
#endif /* NON_MULTIPLE_OF_VECTOR_SIZE */

    // Shift values, exp and sum
    for(uint i = VECTOR_SIZE_LEFTOVER; i < SRC_WIDTH; i += VECTOR_SIZE)
    {
        VEC_BASE data = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(src_addr + i * sizeof(DATA_TYPE)));
        VEC_INT data_fp = CONVERT(data, VEC_INT);
        VEC_INT data_diff = data_fp - max_val;
        VEC_INT data_diff_mult = mult_by_quantized_multiplier(data_diff);
        data_fp = ASYMM_EXP_ON_NEGATIVE_VALUES(data_diff_mult, SCALED_DIFF_INT_BITS, VECTOR_SIZE);
        data_fp = ASYMM_RESCALE(data_fp, 0, EXP_ACCUMULATION_INT_BITS, VECTOR_SIZE);
        VSTORE(VECTOR_SIZE)
        (data_diff, 0, (__global int *)(dst_addr + i * sizeof(int)));
        sum1D = sum1D + select(0, data_fp, data_diff >= (VEC_INT)(DIFF_MIN));
    }

    // Perform sum reduction
    *((__global int *)sum.ptr) = SUM_REDUCE(sum1D, VECTOR_SIZE);
}
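// Example (hypothetical) configuration for the serial variant above, for a QASYMM8_SIGNED input with
// SRC_WIDTH=131 and VECTOR_SIZE=16 (the quantization-related values are illustrative only):
//   -DDATA_TYPE=char -DQASYMM8_SIGNED -DMIN_VALUE=-128 -DSRC_WIDTH=131 -DVECTOR_SIZE=16 -DLOG_VECTOR_SIZE=4
//   -DVECTOR_SIZE_LEFTOVER=3 -DNON_MULTIPLE_OF_VECTOR_SIZE -DSCALED_DIFF_INT_BITS=5 -DEXP_ACCUMULATION_INT_BITS=12 -DDIFF_MIN=-128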

/** Identifies the maximum value across the 1st dimension and shifts the values of the input tensor by this maximum value,
 * then gets the exponent of each element and sums all elements across each row.
 *
 * @note The data type must be given as a preprocessor argument using -DDATA_TYPE, e.g. -DDATA_TYPE=uchar
 * @note The zero value for the given data type must be given as a preprocessor argument using -DMIN_VALUE, e.g. -DMIN_VALUE=-128
 * @note The vector size must be given as a preprocessor argument using -DVECTOR_SIZE=size, e.g. -DVECTOR_SIZE=16
 * @note The leftover vector size must be passed at compile time using -DVECTOR_SIZE_LEFTOVER, e.g. -DVECTOR_SIZE_LEFTOVER=3. It is defined as the remainder between the input's first dimension and VECTOR_SIZE
 * @note In case the input is not a multiple of VECTOR_SIZE (2, 4, 8, 16), -DNON_MULTIPLE_OF_VECTOR_SIZE must be passed.
 * @note Quantized beta can optionally be passed at compile time using -DINPUT_BETA_MULTIPLIER and -DINPUT_BETA_LEFT_SHIFT (if undefined, beta is assumed to be 1.0)
 * @note Additional quantization data must be passed at compile time using -DSCALED_DIFF_INT_BITS and -DEXP_ACCUMULATION_INT_BITS.
 * @note -DDIFF_MIN must be passed at compile time. It is the threshold difference between the maximum value of the input data and the value currently processed; it determines whether the value is taken into account or not.
 * @note In case the input's data type is QASYMM8_SIGNED, -DQASYMM8_SIGNED must be passed.
 *
 * @param[in]  src_ptr Pointer to the source tensor slice. Supported data types: QASYMM8/QASYMM8_SIGNED
 * @param[in]  src_stride_x Stride of the source tensor in X dimension (in bytes)
 * @param[in]  src_step_x src_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  src_stride_y Stride of the source tensor in Y dimension (in bytes)
 * @param[in]  src_step_y src_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  src_stride_z Stride of the source tensor in Z dimension (in bytes)
 * @param[in]  src_step_z src_stride_z * number of elements along Z processed per workitem (in bytes)
 * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
 * @param[in]  maxo_ptr Pointer to the max values tensor slice. Supported data types: same as @p src_ptr
 * @param[in]  maxo_stride_x Stride of the max values tensor in X dimension (in bytes)
 * @param[in]  maxo_step_x maxo_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  maxo_stride_y Stride of the max values tensor in Y dimension (in bytes)
 * @param[in]  maxo_step_y maxo_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  maxo_stride_z Stride of the max values tensor in Z dimension (in bytes)
 * @param[in]  maxo_step_z maxo_stride_z * number of elements along Z processed per workitem (in bytes)
 * @param[in]  maxo_offset_first_element_in_bytes The offset of the first element in the max values tensor
 * @param[out] dst_ptr Pointer to the destination tensor slice. Supported data types: S32
 * @param[in]  dst_stride_x Stride of the destination tensor in X dimension (in bytes)
 * @param[in]  dst_step_x dst_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
 * @param[in]  dst_step_y dst_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
 * @param[in]  dst_step_z dst_stride_z * number of elements along Z processed per workitem (in bytes)
 * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
 * @param[out] sum_ptr Pointer to the sum values tensor slice. Supported data types: same as @p dst_ptr
 * @param[in]  sum_stride_x Stride of the sum values tensor in X dimension (in bytes)
 * @param[in]  sum_step_x sum_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  sum_stride_y Stride of the sum values tensor in Y dimension (in bytes)
 * @param[in]  sum_step_y sum_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  sum_stride_z Stride of the sum values tensor in Z dimension (in bytes)
 * @param[in]  sum_step_z sum_stride_z * number of elements along Z processed per workitem (in bytes)
 * @param[in]  sum_offset_first_element_in_bytes The offset of the first element in the sum values tensor
 */
__kernel void softmax_layer_max_shift_exp_sum_quantized_parallel(
    TENSOR3D_DECLARATION(src),
    TENSOR3D_DECLARATION(maxo),
    TENSOR3D_DECLARATION(dst),
    TENSOR3D_DECLARATION(sum))
{
    const uint lid = get_local_id(0);
    const uint x_offs = (VECTOR_SIZE_LEFTOVER + lid * VECTOR_SIZE);

    __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x_offs * sizeof(DATA_TYPE) + get_global_id(1) * src_stride_y + get_global_id(2) * src_stride_z;
    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_offs * sizeof(int) + get_global_id(1) * dst_stride_y + get_global_id(2) * dst_stride_z;

    Image maxo = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(maxo);
    Image sum = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(sum);

    // Define one temporary vector per work-item.
    __local VEC_INT tmp_local[GRID_SIZE];
    __local DATA_TYPE max_local;

    VEC_BASE vec_min_val = (VEC_BASE)(MIN_VALUE);
    VEC_BASE max_val_vec = vec_min_val;

    // Number of iterations per work-item.
    const uint width = (SRC_WIDTH / GRID_SIZE) >> LOG_VECTOR_SIZE;
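    // Each row is processed cooperatively by GRID_SIZE work-items: work-item lid starts at element
    // VECTOR_SIZE_LEFTOVER + lid * VECTOR_SIZE and strides by GRID_SIZE * VECTOR_SIZE, so consecutive
    // work-items read consecutive VECTOR_SIZE-wide chunks; the per-work-item partial results are then
    // combined through the local-memory reductions below.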
    // Calculate max of row
    uint i = 0;
    for(; i < width; ++i)
    {
        VEC_BASE data_max = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(src_addr + (i * GRID_SIZE * VECTOR_SIZE) * sizeof(DATA_TYPE)));
        max_val_vec = max(data_max, max_val_vec);
    }
#ifdef NON_MULTIPLE_OF_GRID_SIZE
    // How many work-items are needed to complete the computation.
    //TODO: Optimize this calculation (avoid %).
    int boundary_workitems = (SRC_WIDTH % (GRID_SIZE * VECTOR_SIZE)) / VECTOR_SIZE;
    if(lid < boundary_workitems)
    {
        VEC_BASE data_max = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(src_addr + (i * GRID_SIZE * VECTOR_SIZE) * sizeof(DATA_TYPE)));
        max_val_vec = max(data_max, max_val_vec);
    }
#ifdef NON_MULTIPLE_OF_VECTOR_SIZE
    VEC_INT widx;
    if(lid == 0)
    {
        // Handle the leftover elements (input width not a multiple of VECTOR_SIZE)
        VEC_BASE data_max = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(src_addr - VECTOR_SIZE_LEFTOVER * sizeof(DATA_TYPE)));
        widx = (VEC_INT)VECTOR_SIZE_LEFTOVER > VEC_OFFS(int, VECTOR_SIZE);
        max_val_vec = max(max_val_vec, select(vec_min_val, data_max, CONVERT(widx, VEC_BASE)));
    }
#endif /* NON_MULTIPLE_OF_VECTOR_SIZE */
#endif /* NON_MULTIPLE_OF_GRID_SIZE */
    tmp_local[lid] = CONVERT(max_val_vec, VEC_INT);

    barrier(CLK_LOCAL_MEM_FENCE);

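    // Tree reduction of the per-work-item partial maxima in local memory: each step folds the upper half of
    // the active work-items into the lower half, halving the active count until only tmp_local[0] and
    // tmp_local[1] remain to be combined by work-item 0.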
    if(GRID_SIZE >= 256)
    {
        if(lid < 128)
        {
            tmp_local[lid] = max(tmp_local[lid + 128], tmp_local[lid]);
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(GRID_SIZE >= 128)
    {
        if(lid < 64)
        {
            tmp_local[lid] = max(tmp_local[lid + 64], tmp_local[lid]);
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(GRID_SIZE >= 64)
    {
        if(lid < 32)
        {
            tmp_local[lid] = max(tmp_local[lid + 32], tmp_local[lid]);
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(GRID_SIZE >= 32)
    {
        if(lid < 16)
        {
            tmp_local[lid] = max(tmp_local[lid + 16], tmp_local[lid]);
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(GRID_SIZE >= 16)
    {
        if(lid < 8)
        {
            tmp_local[lid] = max(tmp_local[lid + 8], tmp_local[lid]);
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(GRID_SIZE >= 8)
    {
        if(lid < 4)
        {
            tmp_local[lid] = max(tmp_local[lid + 4], tmp_local[lid]);
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(GRID_SIZE >= 4)
    {
        if(lid < 2)
        {
            tmp_local[lid] = max(tmp_local[lid + 2], tmp_local[lid]);
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(lid == 0)
    {
        max_val_vec = max(CONVERT((tmp_local[lid + 1]), VEC_BASE), CONVERT((tmp_local[lid]), VEC_BASE));
        max_local = MAX_REDUCE(max_val_vec, VECTOR_SIZE);
    }
    barrier(CLK_LOCAL_MEM_FENCE);

    /* Second section */

    // Set sum vector
    VEC_INT sum1D = 0;
    int max_val = convert_int(max_local);

    // Shift values, exp and sum
    for(i = 0; i < width; ++i)
    {
        VEC_BASE data = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(src_addr + (i * GRID_SIZE * VECTOR_SIZE) * sizeof(DATA_TYPE)));
        VEC_INT data_fp = CONVERT(data, VEC_INT);
        VEC_INT data_diff = data_fp - max_val;
        VEC_INT data_diff_mult = mult_by_quantized_multiplier(data_diff);
        data_fp = ASYMM_EXP_ON_NEGATIVE_VALUES(data_diff_mult, SCALED_DIFF_INT_BITS, VECTOR_SIZE);
        data_fp = ASYMM_RESCALE(data_fp, 0, EXP_ACCUMULATION_INT_BITS, VECTOR_SIZE);
        VSTORE(VECTOR_SIZE)
        (data_diff, 0, (__global int *)(dst_addr + (i * GRID_SIZE * VECTOR_SIZE) * sizeof(int)));
        sum1D = sum1D + select(0, data_fp, data_diff >= (VEC_INT)(DIFF_MIN));
    }
#ifdef NON_MULTIPLE_OF_GRID_SIZE
    //TODO: Optimize the calculation (avoid %).
    boundary_workitems = (SRC_WIDTH % (GRID_SIZE * VECTOR_SIZE)) / VECTOR_SIZE;
    if(lid < boundary_workitems)
    {
        VEC_BASE data = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(src_addr + (i * GRID_SIZE * VECTOR_SIZE) * sizeof(DATA_TYPE)));
        VEC_INT data_fp = CONVERT(data, VEC_INT);
        VEC_INT data_diff = data_fp - max_val;
        VEC_INT data_diff_mult = mult_by_quantized_multiplier(data_diff);
        data_fp = ASYMM_EXP_ON_NEGATIVE_VALUES(data_diff_mult, SCALED_DIFF_INT_BITS, VECTOR_SIZE);
        data_fp = ASYMM_RESCALE(data_fp, 0, EXP_ACCUMULATION_INT_BITS, VECTOR_SIZE);
        VSTORE(VECTOR_SIZE)
        (data_diff, 0, (__global int *)(dst_addr + (i * GRID_SIZE * VECTOR_SIZE) * sizeof(int)));
        sum1D = sum1D + select(0, data_fp, data_diff >= (VEC_INT)(DIFF_MIN));
    }
#ifdef NON_MULTIPLE_OF_VECTOR_SIZE
    if(lid == 0)
    {
        // Handle the leftover elements at the start of the row (input width not a multiple of VECTOR_SIZE)
        VEC_BASE data = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(src_addr - VECTOR_SIZE_LEFTOVER * sizeof(DATA_TYPE)));
        VEC_INT data_fp = CONVERT(data, VEC_INT);
        VEC_INT data_diff = data_fp - max_val;
        VEC_INT data_diff_mult = mult_by_quantized_multiplier(data_diff);
        data_fp = ASYMM_EXP_ON_NEGATIVE_VALUES(data_diff_mult, SCALED_DIFF_INT_BITS, VECTOR_SIZE);
        data_fp = ASYMM_RESCALE(data_fp, 0, EXP_ACCUMULATION_INT_BITS, VECTOR_SIZE);
        VSTORE_PARTIAL(VECTOR_SIZE, VECTOR_SIZE_LEFTOVER)
        (data_diff, 0, (__global int *)(dst_addr - VECTOR_SIZE_LEFTOVER * sizeof(int)));
        data_fp = select(MIN_VALUE, data_fp, data_diff >= (VEC_INT)(DIFF_MIN));
        data_fp = select(0, data_fp, widx);
        sum1D = sum1D + data_fp;
    }
#endif /* NON_MULTIPLE_OF_VECTOR_SIZE */
#endif /* NON_MULTIPLE_OF_GRID_SIZE */
    tmp_local[lid] = sum1D;

    barrier(CLK_LOCAL_MEM_FENCE);

    if(GRID_SIZE >= 256)
    {
        if(lid < 128)
        {
            tmp_local[lid] += tmp_local[lid + 128];
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(GRID_SIZE >= 128)
    {
        if(lid < 64)
        {
            tmp_local[lid] += tmp_local[lid + 64];
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(GRID_SIZE >= 64)
    {
        if(lid < 32)
        {
            tmp_local[lid] += tmp_local[lid + 32];
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(GRID_SIZE >= 32)
    {
        if(lid < 16)
        {
            tmp_local[lid] += tmp_local[lid + 16];
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(GRID_SIZE >= 16)
    {
        if(lid < 8)
        {
            tmp_local[lid] += tmp_local[lid + 8];
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(GRID_SIZE >= 8)
    {
        if(lid < 4)
        {
            tmp_local[lid] += tmp_local[lid + 4];
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(GRID_SIZE >= 4)
    {
        if(lid < 2)
        {
            tmp_local[lid] += tmp_local[lid + 2];
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(lid == 0)
    {
        sum1D = (tmp_local[lid + 1] + tmp_local[lid]);
        // Perform sum reduction
        *((__global int *)sum.ptr) = SUM_REDUCE(sum1D, VECTOR_SIZE);
    }
}
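// Note: in addition to the options documented for softmax_layer_norm_quantized, the preprocessor guards above
// imply that the two exp-sum kernels also need -DSRC_WIDTH and -DLOG_VECTOR_SIZE (log2 of VECTOR_SIZE), and
// that the parallel variant expects -DGRID_SIZE (number of cooperating work-items per row) plus
// -DNON_MULTIPLE_OF_GRID_SIZE when SRC_WIDTH is not a multiple of GRID_SIZE * VECTOR_SIZE.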
#endif // #if defined(SRC_WIDTH) && defined(LOG_VECTOR_SIZE)
#endif /* defined(DATA_TYPE) && defined(DIFF_MIN) && defined(VECTOR_SIZE) && defined(VECTOR_SIZE_LEFTOVER) && defined(MIN_VALUE) */