/*
 * Copyright (c) 2017-2019 ARM Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#include "helpers_asymm.h"

#define MAX_OP(x, y, type, size) max((x), (y))
#define ADD_OP(x, y, type, size) ((x) + (y))
#define SUB_OP(x, y, type, size) ((x) - (y))
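// Note: the type and size arguments of MAX_OP/ADD_OP/SUB_OP are unused here; they appear
// to be kept only so call sites stay uniform with the other softmax kernels.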

/* Number of workitems in dimension 0. */
#if !defined(GRID_SIZE)
#define GRID_SIZE 1
#endif /* !defined(GRID_SIZE) */

#if VECTOR_SIZE == 2
__constant uint2 idx__ = (uint2)(0, 1);
#define asymm_mult(a, b) ASYMM_MULT(a, b, 2)
#define asymm_exp_on_negative_values(a, k_integer_bits) ASYMM_EXP_ON_NEGATIVE_VALUES(a, k_integer_bits, 2)
#define asymm_rescale(value, src_integer_bits, dst_integer_bits) ASYMM_RESCALE(value, src_integer_bits, dst_integer_bits, 2)

#elif VECTOR_SIZE == 4
__constant uint4 idx__ = (uint4)(0, 1, 2, 3);
#define asymm_mult(a, b) ASYMM_MULT(a, b, 4)
#define asymm_exp_on_negative_values(a, k_integer_bits) ASYMM_EXP_ON_NEGATIVE_VALUES(a, k_integer_bits, 4)
#define asymm_rescale(value, src_integer_bits, dst_integer_bits) ASYMM_RESCALE(value, src_integer_bits, dst_integer_bits, 4)

#elif VECTOR_SIZE == 8
__constant uint8 idx__ = (uint8)(0, 1, 2, 3, 4, 5, 6, 7);
#define asymm_mult(a, b) ASYMM_MULT(a, b, 8)
#define asymm_exp_on_negative_values(a, k_integer_bits) ASYMM_EXP_ON_NEGATIVE_VALUES(a, k_integer_bits, 8)
#define asymm_rescale(value, src_integer_bits, dst_integer_bits) ASYMM_RESCALE(value, src_integer_bits, dst_integer_bits, 8)

#else /* VECTOR_SIZE DEFAULT */
#define VECTOR_SIZE 16
#define LOG_VECTOR_SIZE 4
__constant uint16 idx__ = (uint16)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
#define asymm_mult(a, b) ASYMM_MULT(a, b, 16)
#define asymm_exp_on_negative_values(a, k_integer_bits) ASYMM_EXP_ON_NEGATIVE_VALUES(a, k_integer_bits, 16)
#define asymm_rescale(value, src_integer_bits, dst_integer_bits) ASYMM_RESCALE(value, src_integer_bits, dst_integer_bits, 16)

#endif /* VECTOR_SIZE END */
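// Note: LOG_VECTOR_SIZE (log2 of VECTOR_SIZE, used for the shifts below) is only defined
// here for the default 16-wide path; for VECTOR_SIZE 2/4/8 it is presumably supplied at
// compile time alongside -DVECTOR_SIZE.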

#define VEC_UCHAR VEC_DATA_TYPE(uchar, VECTOR_SIZE)
#define VEC_UINT VEC_DATA_TYPE(uint, VECTOR_SIZE)
#define VEC_INT VEC_DATA_TYPE(int, VECTOR_SIZE)
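// With the default VECTOR_SIZE of 16, these expand to uchar16, uint16 and int16 respectively.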

#if defined(DIFF_MIN)

VEC_INT mult_by_quantized_multiplier_serial(VEC_INT data)
{
#if defined(INPUT_BETA_MULTIPLIER) && defined(INPUT_BETA_LEFT_SHIFT)
    if(INPUT_BETA_MULTIPLIER > 1)
    {
        return asymm_mult(data * (1 << INPUT_BETA_LEFT_SHIFT), INPUT_BETA_MULTIPLIER);
    }
#endif /* defined(INPUT_BETA_MULTIPLIER) && defined(INPUT_BETA_LEFT_SHIFT) */
    return data;
}

int4 mult_by_quantized_multiplier_parallel(int4 data)
{
#if defined(INPUT_BETA_MULTIPLIER) && defined(INPUT_BETA_LEFT_SHIFT)
    if(INPUT_BETA_MULTIPLIER > 1)
    {
        return ASYMM_MULT(data * (1 << INPUT_BETA_LEFT_SHIFT), INPUT_BETA_MULTIPLIER, 4);
    }
#endif /* defined(INPUT_BETA_MULTIPLIER) && defined(INPUT_BETA_LEFT_SHIFT) */
    return data;
}
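
/* Both helpers apply the quantized beta as a fixed-point multiplication: ASYMM_MULT(a, b)
 * computes approximately round(a * b / 2^31), so the result is roughly
 * data * 2^INPUT_BETA_LEFT_SHIFT * INPUT_BETA_MULTIPLIER / 2^31.
 *
 * An illustrative example (the encoding is an assumption based on the usual gemmlowp-style
 * scheme, not taken from the host code): beta = 1.5 could be passed as
 *
 *     INPUT_BETA_LEFT_SHIFT = 1
 *     INPUT_BETA_MULTIPLIER = 1610612736   // ~0.75 * 2^31
 *     // result ~= data * 2 * 0.75 = 1.5 * data
 */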

/** Shifts the values of the input tensor by the max calculated in softmax_layer_max kernel,
 * then takes the exponent of each element and sums all elements across each row.
 *
 * @note In case the input is not a multiple of VECTOR_SIZE -DNON_MULTIPLE_OF_VECTOR_SIZE must be passed.
 * @note Quantized beta can be optionally passed at compile time using -DINPUT_BETA_MULTIPLIER and -DINPUT_BETA_LEFT_SHIFT (if undefined, beta is assumed to be 1.0)
 * @note -DDIFF_MIN must be passed at compile time. It is the threshold difference between the maximum value of the input data and the currently processed value; it defines whether the value will be taken into account or not.
 *
 * @param[in]  src_ptr                            Pointer to the source tensor slice. Supported data types: QASYMM8
 * @param[in]  src_stride_x                       Stride of the source tensor in X dimension (in bytes)
 * @param[in]  src_step_x                         src_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  src_stride_y                       Stride of the source tensor in Y dimension (in bytes)
 * @param[in]  src_step_y                         src_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  src_stride_z                       Stride of the source tensor in Z dimension (in bytes)
 * @param[in]  src_step_z                         src_stride_z * number of elements along Z processed per workitem (in bytes)
 * @param[in]  src_offset_first_element_in_bytes  The offset of the first element in the source tensor
 * @param[in]  maxo_ptr                           Pointer to the max values tensor slice. Supported data types: same as @p src_ptr
 * @param[in]  maxo_stride_x                      Stride of the max values tensor in X dimension (in bytes)
 * @param[in]  maxo_step_x                        maxo_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  maxo_stride_y                      Stride of the max values tensor in Y dimension (in bytes)
 * @param[in]  maxo_step_y                        maxo_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  maxo_stride_z                      Stride of the max values tensor in Z dimension (in bytes)
 * @param[in]  maxo_step_z                        maxo_stride_z * number of elements along Z processed per workitem (in bytes)
 * @param[in]  maxo_offset_first_element_in_bytes The offset of the first element in the max values tensor
 * @param[out] dst_ptr                            Pointer to the destination tensor slice. Supported data types: S32
 * @param[in]  dst_stride_x                       Stride of the destination tensor in X dimension (in bytes)
 * @param[in]  dst_step_x                         dst_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  dst_stride_y                       Stride of the destination tensor in Y dimension (in bytes)
 * @param[in]  dst_step_y                         dst_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  dst_stride_z                       Stride of the destination tensor in Z dimension (in bytes)
 * @param[in]  dst_step_z                         dst_stride_z * number of elements along Z processed per workitem (in bytes)
 * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination tensor
 * @param[out] sum_ptr                            Pointer to the sum values tensor slice. Supported data types: same as @p dst_ptr
 * @param[in]  sum_stride_x                       Stride of the sum values tensor in X dimension (in bytes)
 * @param[in]  sum_step_x                         sum_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  sum_stride_y                       Stride of the sum values tensor in Y dimension (in bytes)
 * @param[in]  sum_step_y                         sum_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  sum_stride_z                       Stride of the sum values tensor in Z dimension (in bytes)
 * @param[in]  sum_step_z                         sum_stride_z * number of elements along Z processed per workitem (in bytes)
 * @param[in]  sum_offset_first_element_in_bytes  The offset of the first element in the sum values tensor
 * @param[in]  width                              Input image width
 */
__kernel void softmax_layer_max_shift_exp_sum_quantized_serial(
    TENSOR3D_DECLARATION(src),
    TENSOR3D_DECLARATION(maxo),
    TENSOR3D_DECLARATION(dst),
    TENSOR3D_DECLARATION(sum),
    uint width)
{
    Image src  = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
    Image dst  = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
    Image maxo = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(maxo);
    Image sum  = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(sum);

    VEC_UCHAR max_val_vec = 0;

    // Calculate max of row
    const uint width4 = width >> LOG_VECTOR_SIZE;
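    // width4 is the number of complete VECTOR_SIZE-wide chunks in the row; the tail is
    // handled in the NON_MULTIPLE_OF_VECTOR_SIZE block below.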
    for(uint i = 0; i < width4; i++)
    {
        VEC_UCHAR data = VLOAD(VECTOR_SIZE)(0, (__global uchar *)offset(&src, i << LOG_VECTOR_SIZE, 0));
        max_val_vec    = MAX_OP(data, max_val_vec, uchar, VECTOR_SIZE);
    }

#ifdef NON_MULTIPLE_OF_VECTOR_SIZE
    // Handle non-multiple of VECTOR_SIZE: mask out the lanes that fall beyond the row
    VEC_UCHAR uchar_min = (VEC_UCHAR)0;
    VEC_UCHAR data      = VLOAD(VECTOR_SIZE)(0, (__global uchar *)offset(&src, width4 << LOG_VECTOR_SIZE, 0));
    VEC_UCHAR widx      = CONVERT(((VEC_UINT)(width4 << LOG_VECTOR_SIZE) + idx__) < width, VEC_UCHAR);
    max_val_vec         = MAX_OP(max_val_vec, select(uchar_min, data, widx), uchar, VECTOR_SIZE);
#endif /* NON_MULTIPLE_OF_VECTOR_SIZE */

    // Perform max reduction
#if VECTOR_SIZE == 16
    max_val_vec.s01234567 = MAX_OP(max_val_vec.s01234567, max_val_vec.s89ABCDEF, uchar, 8);
#endif /* VECTOR SIZE 16 END */
#if VECTOR_SIZE >= 8
    max_val_vec.s0123 = MAX_OP(max_val_vec.s0123, max_val_vec.s4567, uchar, 4);
#endif /* VECTOR SIZE 8 END */
#if VECTOR_SIZE >= 4
    max_val_vec.s01 = MAX_OP(max_val_vec.s01, max_val_vec.s23, uchar, 2);
#endif /* VECTOR SIZE 4 END */
    max_val_vec.s0 = MAX_OP(max_val_vec.s0, max_val_vec.s1, uchar, 1);

    // Store result
    *((__global uchar *)maxo.ptr) = max_val_vec.s0;

    // Second part

    // Load max value of 1D logits vector (row)
    int max_val = convert_int(*((__global uchar *)offset(&maxo, 0, 0)));

    // Set sum vector, Q(EXP_ACCUMULATION_INT_BITS)
    VEC_INT sum1D = 0;
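    // Each exp() result below lies in (0, 1] and is produced in Q0 fixed point; rescaling it
    // to Q(EXP_ACCUMULATION_INT_BITS) frees integer headroom so that up to
    // 2^EXP_ACCUMULATION_INT_BITS such terms can be accumulated in sum1D without overflow.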

    // Shift values, exp and sum
    for(uint i = 0; i < width4; i++)
    {
        VEC_UCHAR data         = VLOAD(VECTOR_SIZE)(0, (__global uchar *)offset(&src, i << LOG_VECTOR_SIZE, 0));
        VEC_INT data_fp        = CONVERT(data, VEC_INT);
        VEC_INT data_diff      = data_fp - max_val;
        VEC_INT data_diff_mult = mult_by_quantized_multiplier_serial(data_diff);
        data_fp                = asymm_exp_on_negative_values(data_diff_mult, SCALED_DIFF_INT_BITS);
        data_fp                = asymm_rescale(data_fp, 0, EXP_ACCUMULATION_INT_BITS);
        VSTORE(VECTOR_SIZE)
        (data_diff, 0, (__global int *)offset(&dst, i << LOG_VECTOR_SIZE, 0));
        sum1D = sum1D + select(0, data_fp, data_diff >= (VEC_INT)(DIFF_MIN));
    }

#ifdef NON_MULTIPLE_OF_VECTOR_SIZE
    // Handle non-multiple of VECTOR_SIZE: mask out-of-row lanes before accumulating
    data                   = VLOAD(VECTOR_SIZE)(0, (__global uchar *)offset(&src, width4 << LOG_VECTOR_SIZE, 0));
    VEC_INT data_fp        = CONVERT(data, VEC_INT);
    VEC_INT data_diff      = data_fp - max_val;
    VEC_INT data_diff_mult = mult_by_quantized_multiplier_serial(data_diff);
    data_fp                = asymm_exp_on_negative_values(data_diff_mult, SCALED_DIFF_INT_BITS);
    data_fp                = asymm_rescale(data_fp, 0, EXP_ACCUMULATION_INT_BITS);
    VEC_INT widx_          = CONVERT(((VEC_UINT)(width4 << LOG_VECTOR_SIZE) + idx__) < width, VEC_INT);
    VSTORE(VECTOR_SIZE)
    (data_diff, 0, (__global int *)offset(&dst, width4 << LOG_VECTOR_SIZE, 0));
    data_fp = select(0, data_fp, data_diff >= (VEC_INT)(DIFF_MIN));
    sum1D   = sum1D + select(0, data_fp, widx_);
#endif /* NON_MULTIPLE_OF_VECTOR_SIZE */

    // Perform sum reduction
#if VECTOR_SIZE == 16
    sum1D.s01234567 = ADD_OP(sum1D.s01234567, sum1D.s89ABCDEF, int, 8);
#endif /* VECTOR SIZE 16 END */
#if VECTOR_SIZE >= 8
    sum1D.s0123 = ADD_OP(sum1D.s0123, sum1D.s4567, int, 4);
#endif /* VECTOR SIZE 8 END */
#if VECTOR_SIZE >= 4
    sum1D.s01 = ADD_OP(sum1D.s01, sum1D.s23, int, 2);
#endif /* VECTOR SIZE 4 END */
    sum1D.s0 = ADD_OP(sum1D.s0, sum1D.s1, int, 1);

    // Calculate and store result
    *((__global int *)sum.ptr) = sum1D.s0;
}
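
/* A sketch of the compile-time options the kernels in this file expect. The host picks
 * the actual values at configure time, so the numbers below are illustrative assumptions,
 * not values taken from the library:
 *
 *     // Host side (plain C):
 *     const char *build_opts =
 *         " -DDIFF_MIN=-128"                 // hypothetical threshold on (value - max)
 *         " -DSCALED_DIFF_INT_BITS=5"        // fixed-point format of the scaled diff
 *         " -DEXP_ACCUMULATION_INT_BITS=12"  // integer headroom for the exp sum
 *         " -DINPUT_BETA_MULTIPLIER=1610612736 -DINPUT_BETA_LEFT_SHIFT=1"; // beta ~1.5
 *     clBuildProgram(program, 1, &device, build_opts, NULL, NULL);
 */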

/** Identifies the maximum value across the 1st dimension and shifts the values of the input tensor by this maximum value,
 * then takes the exponent of each element and sums all elements across each row.
 *
 * @note In case the input is not a multiple of VECTOR_SIZE (2,4,8,16) -DNON_MULTIPLE_OF_VECTOR_SIZE must be passed.
 * @note Quantized beta can be optionally passed at compile time using -DINPUT_BETA_MULTIPLIER and -DINPUT_BETA_LEFT_SHIFT (if undefined, beta is assumed to be 1.0)
 * @note -DDIFF_MIN must be passed at compile time. It is the threshold difference between the maximum value of the input data and the currently processed value; it defines whether the value will be taken into account or not.
 *
 * @param[in]  src_ptr                            Pointer to the source tensor slice. Supported data types: QASYMM8
 * @param[in]  src_stride_x                       Stride of the source tensor in X dimension (in bytes)
 * @param[in]  src_step_x                         src_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  src_stride_y                       Stride of the source tensor in Y dimension (in bytes)
 * @param[in]  src_step_y                         src_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  src_stride_z                       Stride of the source tensor in Z dimension (in bytes)
 * @param[in]  src_step_z                         src_stride_z * number of elements along Z processed per workitem (in bytes)
 * @param[in]  src_offset_first_element_in_bytes  The offset of the first element in the source tensor
 * @param[in]  maxo_ptr                           Pointer to the max values tensor slice. Supported data types: same as @p src_ptr
 * @param[in]  maxo_stride_x                      Stride of the max values tensor in X dimension (in bytes)
 * @param[in]  maxo_step_x                        maxo_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  maxo_stride_y                      Stride of the max values tensor in Y dimension (in bytes)
 * @param[in]  maxo_step_y                        maxo_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  maxo_stride_z                      Stride of the max values tensor in Z dimension (in bytes)
 * @param[in]  maxo_step_z                        maxo_stride_z * number of elements along Z processed per workitem (in bytes)
 * @param[in]  maxo_offset_first_element_in_bytes The offset of the first element in the max values tensor
 * @param[out] dst_ptr                            Pointer to the destination tensor slice. Supported data types: S32
 * @param[in]  dst_stride_x                       Stride of the destination tensor in X dimension (in bytes)
 * @param[in]  dst_step_x                         dst_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  dst_stride_y                       Stride of the destination tensor in Y dimension (in bytes)
 * @param[in]  dst_step_y                         dst_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  dst_stride_z                       Stride of the destination tensor in Z dimension (in bytes)
 * @param[in]  dst_step_z                         dst_stride_z * number of elements along Z processed per workitem (in bytes)
 * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination tensor
 * @param[out] sum_ptr                            Pointer to the sum values tensor slice. Supported data types: same as @p dst_ptr
 * @param[in]  sum_stride_x                       Stride of the sum values tensor in X dimension (in bytes)
 * @param[in]  sum_step_x                         sum_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  sum_stride_y                       Stride of the sum values tensor in Y dimension (in bytes)
 * @param[in]  sum_step_y                         sum_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  sum_stride_z                       Stride of the sum values tensor in Z dimension (in bytes)
 * @param[in]  sum_step_z                         sum_stride_z * number of elements along Z processed per workitem (in bytes)
 * @param[in]  sum_offset_first_element_in_bytes  The offset of the first element in the sum values tensor
 * @param[in]  width                              Input image width
 */
__kernel void softmax_layer_max_shift_exp_sum_quantized_parallel(
    TENSOR3D_DECLARATION(src),
    TENSOR3D_DECLARATION(maxo),
    TENSOR3D_DECLARATION(dst),
    TENSOR3D_DECLARATION(sum),
    uint width)
{
    Image src  = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
    Image dst  = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
    Image maxo = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(maxo);
    Image sum  = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(sum);

    const uint4 idx4 = (uint4)(0, 1, 2, 3);
    const uint  lid  = get_local_id(0);

    // Define one temporary vector per work-item.
    __local int4 tmp_local[GRID_SIZE];
    __local uchar max_local;

    uchar4 uchar_min   = (uchar4)0;
    uchar4 max_val_vec = uchar_min;

    // Number of elements per work-item.
    const uint row = width / GRID_SIZE;
    // Number of iterations per work-item.
    const uint width_ = row >> 2;
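    // The GRID_SIZE work-items walk the row cooperatively: each iteration loads a 4-element
    // chunk GRID_SIZE * 4 elements further on, and each work-item's base pointer already
    // includes its own offset (set up by CONVERT_TENSOR3D_TO_IMAGE_STRUCT from the host-side
    // step configuration), so the chunks of neighbouring work-items interleave.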
    // Calculate max of row
    uint i = 0;
    for(; i < width_; i++)
    {
        uchar4 data_max = vload4(0, (__global uchar *)offset(&src, i * GRID_SIZE * 4, 0));
        max_val_vec     = MAX_OP(data_max, max_val_vec, uchar, 4);
    }
#ifdef NON_MULTIPLE_OF_GRID_SIZE
    // How many work-items are needed to complete the computation.
    //TODO: Optimize this calculation (avoid %).
    int boundary_workitems = (width % (GRID_SIZE * 4)) / 4;
    if(lid < boundary_workitems)
    {
        uchar4 data_max = vload4(0, (__global uchar *)offset(&src, i * GRID_SIZE * 4, 0));
        max_val_vec     = MAX_OP(data_max, max_val_vec, uchar, 4);
    }
#ifdef NON_MULTIPLE_OF_VECTOR_SIZE
    if(boundary_workitems == 0)
    {
        boundary_workitems = GRID_SIZE;
        i--;
    }
    if(lid == (boundary_workitems - 1))
    {
        // Handle non-multiple of 4: mask out the lanes that fall beyond the row
        uchar4 data_max = vload4(0, (__global uchar *)offset(&src, (GRID_SIZE * i * 4) + 4, 0));
        uchar4 widx     = convert_uchar4(((uint4)(GRID_SIZE * i * 4) + boundary_workitems * 4 + idx4) < width);
        max_val_vec     = MAX_OP(max_val_vec, select(uchar_min, data_max, widx), uchar, 4);
    }
#endif /* NON_MULTIPLE_OF_VECTOR_SIZE */
#endif /* NON_MULTIPLE_OF_GRID_SIZE */
    tmp_local[lid] = convert_int4(max_val_vec);

    barrier(CLK_LOCAL_MEM_FENCE);
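    // Tree reduction in local memory: at each step the active half of the work-items folds
    // the upper half of tmp_local into the lower half, halving the span until work-item 0
    // holds the partial maxima.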

    if(GRID_SIZE >= 256)
    {
        if(lid < 128)
        {
            tmp_local[lid] = MAX_OP(tmp_local[lid + 128], tmp_local[lid], int, 4);
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(GRID_SIZE >= 128)
    {
        if(lid < 64)
        {
            tmp_local[lid] = MAX_OP(tmp_local[lid + 64], tmp_local[lid], int, 4);
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(GRID_SIZE >= 64)
    {
        if(lid < 32)
        {
            tmp_local[lid] = MAX_OP(tmp_local[lid + 32], tmp_local[lid], int, 4);
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(GRID_SIZE >= 32)
    {
        if(lid < 16)
        {
            tmp_local[lid] = MAX_OP(tmp_local[lid + 16], tmp_local[lid], int, 4);
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(GRID_SIZE >= 16)
    {
        if(lid < 8)
        {
            tmp_local[lid] = MAX_OP(tmp_local[lid + 8], tmp_local[lid], int, 4);
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(GRID_SIZE >= 8)
    {
        if(lid < 4)
        {
            tmp_local[lid] = MAX_OP(tmp_local[lid + 4], tmp_local[lid], int, 4);
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(GRID_SIZE >= 4)
    {
        if(lid < 2)
        {
            tmp_local[lid] = MAX_OP(tmp_local[lid + 2], tmp_local[lid], int, 4);
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(lid == 0)
    {
        max_val_vec     = MAX_OP(convert_uchar4(tmp_local[lid + 1]), convert_uchar4(tmp_local[lid]), uchar, 4);
        max_val_vec.s01 = MAX_OP(max_val_vec.s01, max_val_vec.s23, uchar, 2);
        max_val_vec.s0  = MAX_OP(max_val_vec.s0, max_val_vec.s1, uchar, 1);
        max_local       = max_val_vec.s0;
    }
    barrier(CLK_LOCAL_MEM_FENCE);

    /* Second section */

    // Set sum vector
    int4 sum1D  = 0;
    int max_val = convert_int(max_local);

    // Shift values, exp and sum
    for(i = 0; i < width_; i++)
    {
        uchar4 data         = vload4(0, (__global uchar *)offset(&src, i * GRID_SIZE * 4, 0));
        int4 data_fp        = convert_int4(data);
        int4 data_diff      = data_fp - max_val;
        int4 data_diff_mult = mult_by_quantized_multiplier_parallel(data_diff);
        data_fp             = ASYMM_EXP_ON_NEGATIVE_VALUES(data_diff_mult, SCALED_DIFF_INT_BITS, 4);
        data_fp             = ASYMM_RESCALE(data_fp, 0, EXP_ACCUMULATION_INT_BITS, 4);
        vstore4(data_diff, 0, (__global int *)offset(&dst, i * GRID_SIZE * 4, 0));
        sum1D = sum1D + select(0, data_fp, data_diff >= (int4)(DIFF_MIN));
    }
#ifdef NON_MULTIPLE_OF_GRID_SIZE
    //TODO: Optimize this calculation (avoid %).
    boundary_workitems = (width % (GRID_SIZE * 4)) / 4;
    if(lid < boundary_workitems)
    {
        uchar4 data         = vload4(0, (__global uchar *)offset(&src, i * GRID_SIZE * 4, 0));
        int4 data_fp        = convert_int4(data);
        int4 data_diff      = data_fp - max_val;
        int4 data_diff_mult = mult_by_quantized_multiplier_parallel(data_diff);
        data_fp             = ASYMM_EXP_ON_NEGATIVE_VALUES(data_diff_mult, SCALED_DIFF_INT_BITS, 4);
        data_fp             = ASYMM_RESCALE(data_fp, 0, EXP_ACCUMULATION_INT_BITS, 4);
        vstore4(data_diff, 0, (__global int *)offset(&dst, i * GRID_SIZE * 4, 0));
        sum1D = sum1D + select(0, data_fp, data_diff >= (int4)(DIFF_MIN));
    }
#ifdef NON_MULTIPLE_OF_VECTOR_SIZE
    if(boundary_workitems == 0)
    {
        boundary_workitems = GRID_SIZE;
        i--;
    }
    if(lid == (boundary_workitems - 1))
    {
        // Handle non-multiple of vector size: load the chunk 4 elements past the last full
        // GRID_SIZE stride and mask out the lanes that fall beyond the row
        uchar4 data         = vload4(0, (__global uchar *)offset(&src, i * GRID_SIZE * 4 + 4, 0));
        int4 data_fp        = convert_int4(data);
        int4 data_diff      = data_fp - max_val;
        int4 data_diff_mult = mult_by_quantized_multiplier_parallel(data_diff);
        data_fp             = ASYMM_EXP_ON_NEGATIVE_VALUES(data_diff_mult, SCALED_DIFF_INT_BITS, 4);
        data_fp             = ASYMM_RESCALE(data_fp, 0, EXP_ACCUMULATION_INT_BITS, 4);
        int4 widx           = convert_int4(((uint4)(GRID_SIZE * i * 4) + boundary_workitems * 4 + idx4) < width);
        data_fp             = select(0, data_fp, widx);
        vstore4(data_diff, 0, (__global int *)offset(&dst, i * GRID_SIZE * 4 + 4, 0));
        sum1D = sum1D + select(0, data_fp, data_diff >= (int4)(DIFF_MIN));
    }
#endif /* NON_MULTIPLE_OF_VECTOR_SIZE */
#endif /* NON_MULTIPLE_OF_GRID_SIZE */
    tmp_local[lid] = sum1D;

    barrier(CLK_LOCAL_MEM_FENCE);
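    // Same local-memory tree reduction as for the maximum, now summing the per-work-item
    // partial sums.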

    if(GRID_SIZE >= 256)
    {
        if(lid < 128)
        {
            tmp_local[lid] = ADD_OP(tmp_local[lid + 128], tmp_local[lid], int, 4);
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(GRID_SIZE >= 128)
    {
        if(lid < 64)
        {
            tmp_local[lid] = ADD_OP(tmp_local[lid + 64], tmp_local[lid], int, 4);
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(GRID_SIZE >= 64)
    {
        if(lid < 32)
        {
            tmp_local[lid] = ADD_OP(tmp_local[lid + 32], tmp_local[lid], int, 4);
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(GRID_SIZE >= 32)
    {
        if(lid < 16)
        {
            tmp_local[lid] = ADD_OP(tmp_local[lid + 16], tmp_local[lid], int, 4);
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(GRID_SIZE >= 16)
    {
        if(lid < 8)
        {
            tmp_local[lid] = ADD_OP(tmp_local[lid + 8], tmp_local[lid], int, 4);
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(GRID_SIZE >= 8)
    {
        if(lid < 4)
        {
            tmp_local[lid] = ADD_OP(tmp_local[lid + 4], tmp_local[lid], int, 4);
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(GRID_SIZE >= 4)
    {
        if(lid < 2)
        {
            tmp_local[lid] = ADD_OP(tmp_local[lid + 2], tmp_local[lid], int, 4);
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(lid == 0)
    {
        sum1D = ADD_OP(tmp_local[lid + 1], tmp_local[lid], int, 4);
        // Perform sum reduction
        sum1D.s01 = ADD_OP(sum1D.s01, sum1D.s23, int, 2);
        sum1D.s0  = ADD_OP(sum1D.s0, sum1D.s1, int, 1);
        *((__global int *)sum.ptr) = sum1D.s0;
    }
}

/** Divides all the values of the input tensor by the sum calculated from the softmax_layer_max_shift_exp_sum_quantized kernels.
 *
 * @note Quantized beta can be optionally passed at compile time using -DINPUT_BETA_MULTIPLIER and -DINPUT_BETA_LEFT_SHIFT (if undefined, beta is assumed to be 1.0)
 * @note -DDIFF_MIN must be passed at compile time. It is the threshold difference between the maximum value of the input data and the currently processed value; it defines whether the value will be taken into account or not.
 *
 * @param[in]  src_ptr                           Pointer to the source tensor slice. Supported data types: S32
 * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
 * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
 * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
 * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem (in bytes)
 * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
 * @param[in]  sum_ptr                           Pointer to the sum values tensor slice. Supported data types: same as @p src_ptr
 * @param[in]  sum_stride_x                      Stride of the sum values tensor in X dimension (in bytes)
 * @param[in]  sum_step_x                        sum_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  sum_stride_y                      Stride of the sum values tensor in Y dimension (in bytes)
 * @param[in]  sum_step_y                        sum_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  sum_stride_z                      Stride of the sum values tensor in Z dimension (in bytes)
 * @param[in]  sum_step_z                        sum_stride_z * number of elements along Z processed per workitem (in bytes)
 * @param[in]  sum_offset_first_element_in_bytes The offset of the first element in the sum values tensor
 * @param[out] dst_ptr                           Pointer to the destination tensor slice. Supported data types: QASYMM8
 * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
 * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
 * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
 * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem (in bytes)
 * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
 */
__kernel void softmax_layer_norm_quantized(
    TENSOR3D_DECLARATION(src),
    TENSOR3D_DECLARATION(sum),
    TENSOR3D_DECLARATION(dst))
{
    Image src = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
    Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
    Image sum = CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(sum);

    // Load the sum of the row (1D logits vector)
    int sum_val = *((__global int *)offset(&sum, 0, get_global_id(1)));

    // It would be better to calculate this in the previous kernel and pass it here as a parameter
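    // Fixed-point reciprocal of the sum (non-log path below): the sum is normalised into
    // [1, 2) by shifting its top bit up (clz gives the shift amount), the fractional part
    // feeds ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1 to get 1 / (1 + x) in Q0.31, and
    // num_bits_over_unit records the powers of two factored out so the final
    // divide-by-pow2 can reapply them while scaling into the 8-bit output range.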
#ifndef LOG_SOFTMAX
    uint sum_val_u              = convert_uint(sum_val);
    int headroom_plus_one       = clz(sum_val_u);
    int num_bits_over_unit      = EXP_ACCUMULATION_INT_BITS - headroom_plus_one;
    int shifted_sum_minus_one_1 = convert_int((sum_val_u << headroom_plus_one) - (1u << 31));
    int16 shifted_sum_minus_one = shifted_sum_minus_one_1;
    int16 shifted_scale         = ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1(shifted_sum_minus_one, 16);
#endif /* LOG_SOFTMAX */

    // This was already calculated in the previous kernel; ideally it would be stored in a temporary output and reused
    int16 data_diff      = vload16(0, (__global int *)offset(&src, 0, 0));
    int16 data_diff_mult = data_diff;
#if defined(INPUT_BETA_MULTIPLIER) && defined(INPUT_BETA_LEFT_SHIFT)
    if(INPUT_BETA_MULTIPLIER > 1)
    {
        data_diff_mult = ASYMM_MULT(data_diff * (1 << INPUT_BETA_LEFT_SHIFT), INPUT_BETA_MULTIPLIER, 16);
    }
#endif /* defined(INPUT_BETA_MULTIPLIER) && defined(INPUT_BETA_LEFT_SHIFT) */

#ifdef LOG_SOFTMAX
    long16 data = SUB_OP(convert_long16(data_diff_mult), (long16)(sum_val), long, 16);
    data        = select(0L, data, convert_long16(data_diff) >= (long16)(DIFF_MIN));
#else  /* LOG_SOFTMAX */
    int16 data = ASYMM_EXP_ON_NEGATIVE_VALUES(data_diff_mult, SCALED_DIFF_INT_BITS, 16);
    data       = ASYMM_MULT(shifted_scale, data, 16);
    data       = ASYMM_ROUNDING_DIVIDE_BY_POW2(data, num_bits_over_unit + 31 - 8, 16);
    data       = select(0, data, data_diff >= (int16)(DIFF_MIN));
#endif /* LOG_SOFTMAX */
    vstore16(convert_uchar16_sat(data), 0, (__global uchar *)offset(&dst, 0, 0));
}

#endif /* defined(DIFF_MIN) */