/*
 * Copyright (c) 2017-2018 ARM Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#include "helpers_asymm.h"

#define MAX_OP(x, y, type, size) max((x), (y))
#define ADD_OP(x, y, type, size) ((x) + (y))
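// Note: the trailing type/size arguments are unused by these macros; they only
// document the operand type and vector width at each call site.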

/* Number of workitems in dimension 0. */
#if !defined(GRID_SIZE)
#define GRID_SIZE 1
#endif /* !defined(GRID_SIZE) */

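// VECTOR_SIZE and LOG_VECTOR_SIZE are normally supplied together as build
// options (e.g. -DVECTOR_SIZE=8 -DLOG_VECTOR_SIZE=3); only the 16/4 defaults
// of the fallback branch are defined in this file.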
#if VECTOR_SIZE == 2
__constant uint2 idx__ = (uint2)(0, 1);
#define asymm_mult(a, b) ASYMM_MULT(a, b, 2)
#define asymm_exp_on_negative_values(a, k_integer_bits) ASYMM_EXP_ON_NEGATIVE_VALUES(a, k_integer_bits, 2)
#define asymm_rescale(value, src_integer_bits, dst_integer_bits) ASYMM_RESCALE(value, src_integer_bits, dst_integer_bits, 2)

#elif VECTOR_SIZE == 4
__constant uint4 idx__ = (uint4)(0, 1, 2, 3);
#define asymm_mult(a, b) ASYMM_MULT(a, b, 4)
#define asymm_exp_on_negative_values(a, k_integer_bits) ASYMM_EXP_ON_NEGATIVE_VALUES(a, k_integer_bits, 4)
#define asymm_rescale(value, src_integer_bits, dst_integer_bits) ASYMM_RESCALE(value, src_integer_bits, dst_integer_bits, 4)

#elif VECTOR_SIZE == 8
__constant uint8 idx__ = (uint8)(0, 1, 2, 3, 4, 5, 6, 7);
#define asymm_mult(a, b) ASYMM_MULT(a, b, 8)
#define asymm_exp_on_negative_values(a, k_integer_bits) ASYMM_EXP_ON_NEGATIVE_VALUES(a, k_integer_bits, 8)
#define asymm_rescale(value, src_integer_bits, dst_integer_bits) ASYMM_RESCALE(value, src_integer_bits, dst_integer_bits, 8)

#else /* VECTOR_SIZE DEFAULT */
#define VECTOR_SIZE 16
#define LOG_VECTOR_SIZE 4
__constant uint16 idx__ = (uint16)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
#define asymm_mult(a, b) ASYMM_MULT(a, b, 16)
#define asymm_exp_on_negative_values(a, k_integer_bits) ASYMM_EXP_ON_NEGATIVE_VALUES(a, k_integer_bits, 16)
#define asymm_rescale(value, src_integer_bits, dst_integer_bits) ASYMM_RESCALE(value, src_integer_bits, dst_integer_bits, 16)

#endif /* VECTOR_SIZE END */

#define VEC_UCHAR VEC_DATA_TYPE(uchar, VECTOR_SIZE)
#define VEC_UINT VEC_DATA_TYPE(uint, VECTOR_SIZE)
#define VEC_INT VEC_DATA_TYPE(int, VECTOR_SIZE)

#if defined(DIFF_MIN)

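// Illustrative build options for the kernels below (values here are only an
// example; the host computes the real ones from the quantization info and beta):
//   -DDIFF_MIN=-128 -DSCALED_DIFF_INT_BITS=5 -DEXP_ACCUMULATION_INT_BITS=12
//   -DINPUT_BETA_MULTIPLIER=1073741824 -DINPUT_BETA_LEFT_SHIFT=2

// The two helpers below scale the (negative) difference data by the quantized
// beta: when beta != 1 the host encodes it as a normalized fixed-point multiplier
// plus a left shift, and ASYMM_MULT performs a gemmlowp-style saturating
// rounding doubling high multiply.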
VEC_INT mult_by_quantized_multiplier_serial(VEC_INT data)
{
#if defined(INPUT_BETA_MULTIPLIER) && defined(INPUT_BETA_LEFT_SHIFT)
    if(INPUT_BETA_MULTIPLIER > 1)
    {
        return asymm_mult(data * (1 << INPUT_BETA_LEFT_SHIFT), INPUT_BETA_MULTIPLIER);
    }
#endif /* defined(INPUT_BETA_MULTIPLIER) && defined(INPUT_BETA_LEFT_SHIFT) */
    return data;
}

int4 mult_by_quantized_multiplier_parallel(int4 data)
{
#if defined(INPUT_BETA_MULTIPLIER) && defined(INPUT_BETA_LEFT_SHIFT)
    if(INPUT_BETA_MULTIPLIER > 1)
    {
        return ASYMM_MULT(data * (1 << INPUT_BETA_LEFT_SHIFT), INPUT_BETA_MULTIPLIER, 4);
    }
#endif /* defined(INPUT_BETA_MULTIPLIER) && defined(INPUT_BETA_LEFT_SHIFT) */
    return data;
}

/** Shifts the values of the input tensor by the max calculated in softmax_layer_max kernel,
 * then gets the exponent of each element and sums all elements across each row.
 *
 * @note In case the input is not a multiple of VECTOR_SIZE (2,4,8,16) -DNON_MULTIPLE_OF_VECTOR_SIZE must be passed.
 * @note Quantized beta can be optionally passed at compile time using -DINPUT_BETA_MULTIPLIER and -DINPUT_BETA_LEFT_SHIFT (if undefined, assume beta equals 1.0)
 * @note -DDIFF_MIN must be passed at compile time. It is the threshold difference between the maximum value of the input data and the current processed value; it defines whether the value will be taken into account or not.
 *
 * @param[in]  src_ptr                             Pointer to the source tensor slice. Supported data types: QASYMM8
 * @param[in]  src_stride_x                        Stride of the source tensor in X dimension (in bytes)
 * @param[in]  src_step_x                          src_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  src_stride_y                        Stride of the source tensor in Y dimension (in bytes)
 * @param[in]  src_step_y                          src_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  src_stride_z                        Stride of the source tensor in Z dimension (in bytes)
 * @param[in]  src_step_z                          src_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in]  src_offset_first_element_in_bytes   The offset of the first element in the source tensor
 * @param[in]  maxo_ptr                            Pointer to the max values tensor slice. Supported data types: same as @p src_ptr
 * @param[in]  maxo_stride_x                       Stride of the max values tensor in X dimension (in bytes)
 * @param[in]  maxo_step_x                         maxo_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  maxo_stride_y                       Stride of the max values tensor in Y dimension (in bytes)
 * @param[in]  maxo_step_y                         maxo_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  maxo_stride_z                       Stride of the max values tensor in Z dimension (in bytes)
 * @param[in]  maxo_step_z                         maxo_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in]  maxo_offset_first_element_in_bytes  The offset of the first element in the max values tensor
 * @param[out] dst_ptr                             Pointer to the destination tensor slice. Supported data types: S32
 * @param[in]  dst_stride_x                        Stride of the destination tensor in X dimension (in bytes)
 * @param[in]  dst_step_x                          dst_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  dst_stride_y                        Stride of the destination tensor in Y dimension (in bytes)
 * @param[in]  dst_step_y                          dst_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  dst_stride_z                        Stride of the destination tensor in Z dimension (in bytes)
 * @param[in]  dst_step_z                          dst_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in]  dst_offset_first_element_in_bytes   The offset of the first element in the destination tensor
 * @param[out] sum_ptr                             Pointer to the sum values tensor slice. Supported data types: same as @p dst_ptr
 * @param[in]  sum_stride_x                        Stride of the sum values tensor in X dimension (in bytes)
 * @param[in]  sum_step_x                          sum_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  sum_stride_y                        Stride of the sum values tensor in Y dimension (in bytes)
 * @param[in]  sum_step_y                          sum_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  sum_stride_z                        Stride of the sum values tensor in Z dimension (in bytes)
 * @param[in]  sum_step_z                          sum_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in]  sum_offset_first_element_in_bytes   The offset of the first element in the sum values tensor
 * @param[in]  width                               Input image width
 */
__kernel void softmax_layer_max_shift_exp_sum_quantized_serial(
    TENSOR3D_DECLARATION(src),
    TENSOR3D_DECLARATION(maxo),
    TENSOR3D_DECLARATION(dst),
    TENSOR3D_DECLARATION(sum),
    uint width)
{
    Image src  = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
    Image dst  = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
    Image maxo = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(maxo);
    Image sum  = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(sum);

    VEC_UCHAR max_val_vec = 0;

    // Calculate max of row
    const uint width4 = width >> LOG_VECTOR_SIZE;
    for(uint i = 0; i < width4; i++)
    {
        VEC_UCHAR data = VLOAD(VECTOR_SIZE)(0, (__global uchar *)offset(&src, i << LOG_VECTOR_SIZE, 0));
        max_val_vec    = MAX_OP(data, max_val_vec, uchar, VECTOR_SIZE);
    }

#ifdef NON_MULTIPLE_OF_VECTOR_SIZE
    // Handle non multiple of VECTOR_SIZE
    VEC_UCHAR uchar_min = (VEC_UCHAR)0;
    VEC_UCHAR data      = VLOAD(VECTOR_SIZE)(0, (__global uchar *)offset(&src, width4 << LOG_VECTOR_SIZE, 0));
    VEC_UCHAR widx      = CONVERT(((VEC_UINT)(width4 << LOG_VECTOR_SIZE) + idx__) < width, VEC_UCHAR);
    max_val_vec         = MAX_OP(max_val_vec, select(uchar_min, data, widx), uchar, VECTOR_SIZE);
#endif /* NON_MULTIPLE_OF_VECTOR_SIZE */

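    // Pairwise reduction: each step folds the upper half of the vector onto the
    // lower half, so after log2(VECTOR_SIZE) steps .s0 holds the row maximum,
    // e.g. for VECTOR_SIZE == 4: {a,b,c,d} -> {max(a,c), max(b,d)} -> max of all four.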
    // Perform max reduction
#if VECTOR_SIZE == 16
    max_val_vec.s01234567 = MAX_OP(max_val_vec.s01234567, max_val_vec.s89ABCDEF, uchar, 8);
#endif /* VECTOR SIZE 16 END */
#if VECTOR_SIZE >= 8
    max_val_vec.s0123 = MAX_OP(max_val_vec.s0123, max_val_vec.s4567, uchar, 4);
#endif /* VECTOR SIZE 8 END */
#if VECTOR_SIZE >= 4
    max_val_vec.s01 = MAX_OP(max_val_vec.s01, max_val_vec.s23, uchar, 2);
#endif /* VECTOR SIZE 4 END */
    max_val_vec.s0 = MAX_OP(max_val_vec.s0, max_val_vec.s1, uchar, 1);

    // Store result
    *((__global uchar *)maxo.ptr) = max_val_vec.s0;

    // Second part

    // Load max value of 1D logits vector (row)
    int max_val = convert_int(*((__global uchar *)offset(&maxo, 0, 0)));

    // Set sum vector, Q(EXP_ACCUMULATION_INT_BITS)
    VEC_INT sum1D = 0;

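    // For each element: data_diff = x - max <= 0 is scaled by beta, exponentiated
    // in fixed point (ASYMM_EXP_ON_NEGATIVE_VALUES returns a Q0-format value in
    // (0, 1]), rescaled into the Q(EXP_ACCUMULATION_INT_BITS) accumulator format,
    // and accumulated; differences below DIFF_MIN contribute nothing to the sum.
    // The raw differences are stored to dst for the follow-up normalization kernel.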
    // Shift values, exp and sum
    for(uint i = 0; i < width4; i++)
    {
        VEC_UCHAR data         = VLOAD(VECTOR_SIZE)(0, (__global uchar *)offset(&src, i << LOG_VECTOR_SIZE, 0));
        VEC_INT data_fp        = CONVERT(data, VEC_INT);
        VEC_INT data_diff      = data_fp - max_val;
        VEC_INT data_diff_mult = mult_by_quantized_multiplier_serial(data_diff);
        data_fp                = asymm_exp_on_negative_values(data_diff_mult, SCALED_DIFF_INT_BITS);
        data_fp                = asymm_rescale(data_fp, 0, EXP_ACCUMULATION_INT_BITS);
        VSTORE(VECTOR_SIZE)
        (data_diff, 0, (__global int *)offset(&dst, i << LOG_VECTOR_SIZE, 0));
        sum1D = sum1D + select(0, data_fp, data_diff >= (VEC_INT)(DIFF_MIN));
    }

#ifdef NON_MULTIPLE_OF_VECTOR_SIZE
    // Handle non multiple of VECTOR_SIZE
    data                   = VLOAD(VECTOR_SIZE)(0, (__global uchar *)offset(&src, width4 << LOG_VECTOR_SIZE, 0));
    VEC_INT data_fp        = CONVERT(data, VEC_INT);
    VEC_INT data_diff      = data_fp - max_val;
    VEC_INT data_diff_mult = mult_by_quantized_multiplier_serial(data_diff);
    data_fp                = asymm_exp_on_negative_values(data_diff_mult, SCALED_DIFF_INT_BITS);
    data_fp                = asymm_rescale(data_fp, 0, EXP_ACCUMULATION_INT_BITS);
    VEC_INT widx_          = CONVERT(((VEC_UINT)(width4 << LOG_VECTOR_SIZE) + idx__) < width, VEC_INT);
    VSTORE(VECTOR_SIZE)
    (data_diff, 0, (__global int *)offset(&dst, width4 << LOG_VECTOR_SIZE, 0));
    data_fp = select(0, data_fp, data_diff >= (VEC_INT)(DIFF_MIN));
    sum1D   = sum1D + select(0, data_fp, widx_);
#endif /* NON_MULTIPLE_OF_VECTOR_SIZE */

    // Perform sum reduction
#if VECTOR_SIZE == 16
    sum1D.s01234567 = ADD_OP(sum1D.s01234567, sum1D.s89ABCDEF, int, 8);
#endif /* VECTOR SIZE 16 END */
#if VECTOR_SIZE >= 8
    sum1D.s0123 = ADD_OP(sum1D.s0123, sum1D.s4567, int, 4);
#endif /* VECTOR SIZE 8 END */
#if VECTOR_SIZE >= 4
    sum1D.s01 = ADD_OP(sum1D.s01, sum1D.s23, int, 2);
#endif /* VECTOR SIZE 4 END */
    sum1D.s0 = ADD_OP(sum1D.s0, sum1D.s1, int, 1);

    // Calculate and store result
    *((__global int *)sum.ptr) = sum1D.s0;
}

/** Identifies the maximum value across the 1st dimension and shifts the values of the input tensor by this maximum value,
 * then gets the exponent of each element and sums all elements across each row.
 *
 * @note In case the input is not a multiple of VECTOR_SIZE (2,4,8,16) -DNON_MULTIPLE_OF_VECTOR_SIZE must be passed.
 * @note Quantized beta can be optionally passed at compile time using -DINPUT_BETA_MULTIPLIER and -DINPUT_BETA_LEFT_SHIFT (if undefined, assume beta equals 1.0)
 * @note -DDIFF_MIN must be passed at compile time. It is the threshold difference between the maximum value of the input data and the current processed value; it defines whether the value will be taken into account or not.
 *
 * @param[in]  src_ptr                             Pointer to the source tensor slice. Supported data types: QASYMM8
 * @param[in]  src_stride_x                        Stride of the source tensor in X dimension (in bytes)
 * @param[in]  src_step_x                          src_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  src_stride_y                        Stride of the source tensor in Y dimension (in bytes)
 * @param[in]  src_step_y                          src_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  src_stride_z                        Stride of the source tensor in Z dimension (in bytes)
 * @param[in]  src_step_z                          src_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in]  src_offset_first_element_in_bytes   The offset of the first element in the source tensor
 * @param[in]  maxo_ptr                            Pointer to the max values tensor slice. Supported data types: same as @p src_ptr
 * @param[in]  maxo_stride_x                       Stride of the max values tensor in X dimension (in bytes)
 * @param[in]  maxo_step_x                         maxo_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  maxo_stride_y                       Stride of the max values tensor in Y dimension (in bytes)
 * @param[in]  maxo_step_y                         maxo_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  maxo_stride_z                       Stride of the max values tensor in Z dimension (in bytes)
 * @param[in]  maxo_step_z                         maxo_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in]  maxo_offset_first_element_in_bytes  The offset of the first element in the max values tensor
 * @param[out] dst_ptr                             Pointer to the destination tensor slice. Supported data types: S32
 * @param[in]  dst_stride_x                        Stride of the destination tensor in X dimension (in bytes)
 * @param[in]  dst_step_x                          dst_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  dst_stride_y                        Stride of the destination tensor in Y dimension (in bytes)
 * @param[in]  dst_step_y                          dst_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  dst_stride_z                        Stride of the destination tensor in Z dimension (in bytes)
 * @param[in]  dst_step_z                          dst_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in]  dst_offset_first_element_in_bytes   The offset of the first element in the destination tensor
 * @param[out] sum_ptr                             Pointer to the sum values tensor slice. Supported data types: same as @p dst_ptr
 * @param[in]  sum_stride_x                        Stride of the sum values tensor in X dimension (in bytes)
 * @param[in]  sum_step_x                          sum_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  sum_stride_y                        Stride of the sum values tensor in Y dimension (in bytes)
 * @param[in]  sum_step_y                          sum_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  sum_stride_z                        Stride of the sum values tensor in Z dimension (in bytes)
 * @param[in]  sum_step_z                          sum_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in]  sum_offset_first_element_in_bytes   The offset of the first element in the sum values tensor
 * @param[in]  width                               Input image width
 */
__kernel void softmax_layer_max_shift_exp_sum_quantized_parallel(
    TENSOR3D_DECLARATION(src),
    TENSOR3D_DECLARATION(maxo),
    TENSOR3D_DECLARATION(dst),
    TENSOR3D_DECLARATION(sum),
    uint width)
{
    Image src  = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
    Image dst  = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
    Image maxo = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(maxo);
    Image sum  = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(sum);

    const uint4 idx4 = (uint4)(0, 1, 2, 3);
    const uint lid   = get_local_id(0);

    // Define one temporary vector per work-item.
    __local int4 tmp_local[GRID_SIZE];
    __local uchar max_local;

    uchar4 uchar_min   = (uchar4)0;
    uchar4 max_val_vec = uchar_min;

    // Number of elements per work-item.
    const uint row = width / GRID_SIZE;
    // Number of iterations per work-item.
    const uint width_ = row >> 2;
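    // Each work-item reads 4 elements at a time; its global id is already folded
    // into src.ptr by CONVERT_TENSOR3D_TO_IMAGE_STRUCT (assuming the host sets
    // src_step_x accordingly), so consecutive iterations stride by GRID_SIZE * 4
    // elements and the work-group covers the whole row between them.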
    // Calculate max of row
    uint i = 0;
    for(; i < width_; i++)
    {
        uchar4 data_max = vload4(0, (__global uchar *)offset(&src, i * GRID_SIZE * 4, 0));
        max_val_vec     = MAX_OP(data_max, max_val_vec, uchar, 4);
    }
#ifdef NON_MULTIPLE_OF_GRID_SIZE
    // How many work-items are needed to complete the computation.
    //TODO: Optimize this calculation (avoid %).
    int boundary_workitems = (width % (GRID_SIZE * 4)) / 4;
    if(lid < boundary_workitems)
    {
        uchar4 data_max = vload4(0, (__global uchar *)offset(&src, i * GRID_SIZE * 4, 0));
        max_val_vec     = MAX_OP(data_max, max_val_vec, uchar, 4);
    }
#ifdef NON_MULTIPLE_OF_VECTOR_SIZE
    if(boundary_workitems == 0)
    {
        boundary_workitems = GRID_SIZE;
        i--;
    }
    if(lid == (boundary_workitems - 1))
    {
        // Handle non multiple of 4
        uchar4 data_max = vload4(0, (__global uchar *)offset(&src, (GRID_SIZE * i * 4) + 4, 0));
        uchar4 widx     = convert_uchar4(((uint4)(GRID_SIZE * i * 4) + boundary_workitems * 4 + idx4) < width);
        max_val_vec     = MAX_OP(max_val_vec, select(uchar_min, data_max, widx), uchar, 4);
    }
#endif /* NON_MULTIPLE_OF_VECTOR_SIZE */
#endif /* NON_MULTIPLE_OF_GRID_SIZE */
    tmp_local[lid] = convert_int4(max_val_vec);

    barrier(CLK_LOCAL_MEM_FENCE);

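    // Classic work-group tree reduction in local memory: at every step the number
    // of active work-items halves and each survivor merges its partner's partial
    // vector, with barriers keeping the local view consistent; after the last
    // step work-item 0 owns the reduced result.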
    if(GRID_SIZE >= 256)
    {
        if(lid < 128)
        {
            tmp_local[lid] = MAX_OP(tmp_local[lid + 128], tmp_local[lid], int, 4);
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(GRID_SIZE >= 128)
    {
        if(lid < 64)
        {
            tmp_local[lid] = MAX_OP(tmp_local[lid + 64], tmp_local[lid], int, 4);
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(GRID_SIZE >= 64)
    {
        if(lid < 32)
        {
            tmp_local[lid] = MAX_OP(tmp_local[lid + 32], tmp_local[lid], int, 4);
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(GRID_SIZE >= 32)
    {
        if(lid < 16)
        {
            tmp_local[lid] = MAX_OP(tmp_local[lid + 16], tmp_local[lid], int, 4);
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(GRID_SIZE >= 16)
    {
        if(lid < 8)
        {
            tmp_local[lid] = MAX_OP(tmp_local[lid + 8], tmp_local[lid], int, 4);
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(GRID_SIZE >= 8)
    {
        if(lid < 4)
        {
            tmp_local[lid] = MAX_OP(tmp_local[lid + 4], tmp_local[lid], int, 4);
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(GRID_SIZE >= 4)
    {
        if(lid < 2)
        {
            tmp_local[lid] = MAX_OP(tmp_local[lid + 2], tmp_local[lid], int, 4);
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(lid == 0)
    {
        max_val_vec     = MAX_OP(convert_uchar4(tmp_local[lid + 1]), convert_uchar4(tmp_local[lid]), uchar, 4);
        max_val_vec.s01 = MAX_OP(max_val_vec.s01, max_val_vec.s23, uchar, 2);
        max_val_vec.s0  = MAX_OP(max_val_vec.s0, max_val_vec.s1, uchar, 1);
        max_local       = max_val_vec.s0;
    }
    barrier(CLK_LOCAL_MEM_FENCE);

    /* Second section */

    // Set sum vector
    int4 sum1D  = 0;
    int max_val = convert_int(max_local);

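    // Same shift/exp/sum scheme as the serial kernel, but on int4 lanes: each
    // work-item accumulates a partial sum1D over its slice of the row, which is
    // combined across the work-group in local memory below.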
    // Shift values, exp and sum
    for(i = 0; i < width_; i++)
    {
        uchar4 data         = vload4(0, (__global uchar *)offset(&src, i * GRID_SIZE * 4, 0));
        int4 data_fp        = convert_int4(data);
        int4 data_diff      = data_fp - max_val;
        int4 data_diff_mult = mult_by_quantized_multiplier_parallel(data_diff);
        data_fp             = ASYMM_EXP_ON_NEGATIVE_VALUES(data_diff_mult, SCALED_DIFF_INT_BITS, 4);
        data_fp             = ASYMM_RESCALE(data_fp, 0, EXP_ACCUMULATION_INT_BITS, 4);
        vstore4(data_diff, 0, (__global int *)offset(&dst, i * GRID_SIZE * 4, 0));
        sum1D = sum1D + select(0, data_fp, data_diff >= (int4)(DIFF_MIN));
    }
#ifdef NON_MULTIPLE_OF_GRID_SIZE
    //TODO: Optimize the calculation (avoid %).
    boundary_workitems = (width % (GRID_SIZE * 4)) / 4;
    if(lid < boundary_workitems)
    {
        uchar4 data         = vload4(0, (__global uchar *)offset(&src, i * GRID_SIZE * 4, 0));
        int4 data_fp        = convert_int4(data);
        int4 data_diff      = data_fp - max_val;
        int4 data_diff_mult = mult_by_quantized_multiplier_parallel(data_diff);
        data_fp             = ASYMM_EXP_ON_NEGATIVE_VALUES(data_diff_mult, SCALED_DIFF_INT_BITS, 4);
        data_fp             = ASYMM_RESCALE(data_fp, 0, EXP_ACCUMULATION_INT_BITS, 4);
        vstore4(data_diff, 0, (__global int *)offset(&dst, i * GRID_SIZE * 4, 0));
        sum1D = sum1D + select(0, data_fp, data_diff >= (int4)(DIFF_MIN));
    }
#ifdef NON_MULTIPLE_OF_VECTOR_SIZE
    if(boundary_workitems == 0)
    {
        boundary_workitems = GRID_SIZE;
        i--;
    }
    if(lid == (boundary_workitems - 1))
    {
        // Handle non multiple of vector size: load 4 elements ahead of the last
        // full load ((GRID_SIZE * i * 4) + 4) and mask out the lanes beyond width
        uchar4 data         = vload4(0, (__global uchar *)offset(&src, i * GRID_SIZE * 4 + 4, 0));
        int4 data_fp        = convert_int4(data);
        int4 data_diff      = data_fp - max_val;
        int4 data_diff_mult = mult_by_quantized_multiplier_parallel(data_diff);
        data_fp             = ASYMM_EXP_ON_NEGATIVE_VALUES(data_diff_mult, SCALED_DIFF_INT_BITS, 4);
        data_fp             = ASYMM_RESCALE(data_fp, 0, EXP_ACCUMULATION_INT_BITS, 4);
        int4 widx           = convert_int4(((uint4)(GRID_SIZE * i * 4) + boundary_workitems * 4 + idx4) < width);
        data_fp             = select(0, data_fp, widx);
        vstore4(data_diff, 0, (__global int *)offset(&dst, i * GRID_SIZE * 4 + 4, 0));
        sum1D = sum1D + select(0, data_fp, data_diff >= (int4)(DIFF_MIN));
    }
#endif /* NON_MULTIPLE_OF_VECTOR_SIZE */
#endif /* NON_MULTIPLE_OF_GRID_SIZE */
    tmp_local[lid] = sum1D;

    barrier(CLK_LOCAL_MEM_FENCE);

    if(GRID_SIZE >= 256)
    {
        if(lid < 128)
        {
            tmp_local[lid] = ADD_OP(tmp_local[lid + 128], tmp_local[lid], int, 4);
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(GRID_SIZE >= 128)
    {
        if(lid < 64)
        {
            tmp_local[lid] = ADD_OP(tmp_local[lid + 64], tmp_local[lid], int, 4);
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(GRID_SIZE >= 64)
    {
        if(lid < 32)
        {
            tmp_local[lid] = ADD_OP(tmp_local[lid + 32], tmp_local[lid], int, 4);
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(GRID_SIZE >= 32)
    {
        if(lid < 16)
        {
            tmp_local[lid] = ADD_OP(tmp_local[lid + 16], tmp_local[lid], int, 4);
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(GRID_SIZE >= 16)
    {
        if(lid < 8)
        {
            tmp_local[lid] = ADD_OP(tmp_local[lid + 8], tmp_local[lid], int, 4);
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(GRID_SIZE >= 8)
    {
        if(lid < 4)
        {
            tmp_local[lid] = ADD_OP(tmp_local[lid + 4], tmp_local[lid], int, 4);
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(GRID_SIZE >= 4)
    {
        if(lid < 2)
        {
            tmp_local[lid] = ADD_OP(tmp_local[lid + 2], tmp_local[lid], int, 4);
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(lid == 0)
    {
        sum1D = ADD_OP(tmp_local[lid + 1], tmp_local[lid], int, 4);
        // Perform sum reduction
        sum1D.s01 = ADD_OP(sum1D.s01, sum1D.s23, int, 2);
        sum1D.s0  = ADD_OP(sum1D.s0, sum1D.s1, int, 1);
        *((__global int *)sum.ptr) = sum1D.s0;
    }
}

/** Divides all the values of the input tensor by the sum calculated from softmax_layer_shift_exp_sum kernel.
 *
 * @note Quantized beta can be optionally passed at compile time using -DINPUT_BETA_MULTIPLIER and -DINPUT_BETA_LEFT_SHIFT (if undefined, assume beta equals 1.0)
 * @note -DDIFF_MIN must be passed at compile time. It is the threshold difference between the maximum value of the input data and the current processed value; it defines whether the value will be taken into account or not.
 *
 * @param[in]  src_ptr                           Pointer to the source tensor slice. Supported data types: S32
 * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
 * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
 * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
 * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
 * @param[in]  sum_ptr                           Pointer to the sum values tensor slice. Supported data types: same as @p src_ptr
 * @param[in]  sum_stride_x                      Stride of the sum values tensor in X dimension (in bytes)
 * @param[in]  sum_step_x                        sum_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  sum_stride_y                      Stride of the sum values tensor in Y dimension (in bytes)
 * @param[in]  sum_step_y                        sum_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  sum_stride_z                      Stride of the sum values tensor in Z dimension (in bytes)
 * @param[in]  sum_step_z                        sum_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in]  sum_offset_first_element_in_bytes The offset of the first element in the sum values tensor
 * @param[out] dst_ptr                           Pointer to the destination tensor slice. Supported data types: QASYMM8
 * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
 * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
 * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
 * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
 */
__kernel void softmax_layer_norm_quantized(
    TENSOR3D_DECLARATION(src),
    TENSOR3D_DECLARATION(sum),
    TENSOR3D_DECLARATION(dst))
{
    Image src = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
    Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
    Image sum = CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(sum);

    // Load sum value of 1D logits vector (row)
    int sum_val = *((__global int *)offset(&sum, 0, get_global_id(1)));

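    // Reciprocal of the sum, computed entirely in integer arithmetic: clz finds
    // the headroom so the sum can be normalized and the reciprocal evaluated as
    // 1/(1 + x) with x in [0, 1), which ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1
    // approximates by Newton-Raphson iteration.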
    // It would be better to calculate this in the previous layer and pass it here as a parameter
    uint sum_val_u              = convert_uint(sum_val);
    int headroom_plus_one       = clz(sum_val_u);
    int num_bits_over_unit      = EXP_ACCUMULATION_INT_BITS - headroom_plus_one;
    int shifted_sum_minus_one_1 = convert_int((sum_val_u << headroom_plus_one) - (1u << 31));
    int16 shifted_sum_minus_one = shifted_sum_minus_one_1;
    int16 shifted_scale         = ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1(shifted_sum_minus_one, 16);

    // This was already calculated in the previous layer; it should be stored into a temporary output and reused
    int16 data_diff      = vload16(0, (__global int *)offset(&src, 0, 0));
    int16 data_diff_mult = data_diff;
#if defined(INPUT_BETA_MULTIPLIER) && defined(INPUT_BETA_LEFT_SHIFT)
    if(INPUT_BETA_MULTIPLIER > 1)
    {
        data_diff_mult = ASYMM_MULT(data_diff * (1 << INPUT_BETA_LEFT_SHIFT), INPUT_BETA_MULTIPLIER, 16);
    }
#endif /* defined(INPUT_BETA_MULTIPLIER) && defined(INPUT_BETA_LEFT_SHIFT) */
    int16 data = ASYMM_EXP_ON_NEGATIVE_VALUES(data_diff_mult, SCALED_DIFF_INT_BITS, 16);

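    // Multiply each exp value by the reciprocal of the sum, then shift right so a
    // probability of 1.0 maps to 256 (saturated to 255 by the uchar store below);
    // the shift amount folds together the sum's headroom and the 8-bit output
    // scale. Differences below DIFF_MIN are flushed to a probability of 0.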
    data = ASYMM_MULT(shifted_scale, data, 16);
    data = ASYMM_ROUNDING_DIVIDE_BY_POW2(data, num_bits_over_unit + 31 - 8, 16);
    data = select(0, data, data_diff >= (int16)(DIFF_MIN));
    vstore16(convert_uchar16_sat(data), 0, (__global uchar *)offset(&dst, 0, 0));
}

#endif /* defined(DIFF_MIN) */