blob: 77dbb47e4162ac2726d11c3d58c5e21c87a635da [file] [log] [blame]
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001/*
Michele Di Giorgiod9eaf612020-07-08 11:12:57 +01002 * Copyright (c) 2017-2020 Arm Limited.
Anthony Barbier6ff3b192017-09-04 18:44:23 +01003 *
4 * SPDX-License-Identifier: MIT
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to
8 * deal in the Software without restriction, including without limitation the
9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10 * sell copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in all
14 * copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24#include "helpers.h"
25
Georgios Pinitase5f8fd62017-06-23 18:03:44 +010026#define MAX_OP(x, y, type, size) max((x), (y))
27#define ADD_OP(x, y, type, size) ((x) + (y))
28#define SUB_OP(x, y, type, size) ((x) - (y))
Pablo Palmier48a60f92017-10-18 11:03:08 +010029#define MUL_OP(x, y, type, size) ((x) * (y))
Georgios Pinitase5f8fd62017-06-23 18:03:44 +010030#define DIV_OP(x, y, type, size) ((x) / (y))
31#define EXP_OP(x, type, size) exp((x))
32
Anthony Barbierac69aa12017-07-03 17:39:37 +010033#ifdef USE_F16
Georgios Pinitase5f8fd62017-06-23 18:03:44 +010034#define MINVAL -HALF_MAX
35#define SELECT_DATA_TYPE short
Anthony Barbierac69aa12017-07-03 17:39:37 +010036#else /* USE_F16 */
Georgios Pinitase5f8fd62017-06-23 18:03:44 +010037#define MINVAL -FLT_MAX
Anthony Barbier6ff3b192017-09-04 18:44:23 +010038#define SELECT_DATA_TYPE int
Anthony Barbierac69aa12017-07-03 17:39:37 +010039#endif /* USE_F16 */
Georgios Pinitase5f8fd62017-06-23 18:03:44 +010040
Chunosovd6afedc2017-11-06 22:09:45 +070041/* Number of workitems in dimension 0. */
42#if !defined(GRID_SIZE)
43#define GRID_SIZE 1
44#endif /* !defined(GRID_SIZE) */
45
46/* Vector size, i.e. number of vector elements. */
47#if VECTOR_SIZE == 2
48__constant VEC_DATA_TYPE(DATA_TYPE, 2) type_min_ = (VEC_DATA_TYPE(DATA_TYPE, 2))(MINVAL);
49__constant uint2 idx__ = (uint2)(0, 1);
50
51#elif VECTOR_SIZE == 4
52__constant VEC_DATA_TYPE(DATA_TYPE, 4) type_min_ = (VEC_DATA_TYPE(DATA_TYPE, 4))(MINVAL);
53__constant uint4 idx__ = (uint4)(0, 1, 2, 3);
54
55#elif VECTOR_SIZE == 8
56__constant VEC_DATA_TYPE(DATA_TYPE, 8) type_min_ = (VEC_DATA_TYPE(DATA_TYPE, 8))(MINVAL);
57__constant uint8 idx__ = (uint8)(0, 1, 2, 3, 4, 5, 6, 7);
58
59#else /* VECTOR_SIZE DEFAULT */
60#define VECTOR_SIZE 16
61#define LOG_VECTOR_SIZE 4
62__constant VEC_DATA_TYPE(DATA_TYPE, 16) type_min_ = (VEC_DATA_TYPE(DATA_TYPE, 16))(MINVAL);
63__constant uint16 idx__ = (uint16)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
64
65#endif /* VECTOR_SIZE END */
66
67// TODO (COMPMID-661): Remove if the non-fused kernels are removed
Anthony Barbier6ff3b192017-09-04 18:44:23 +010068__constant VEC_DATA_TYPE(DATA_TYPE, 16) type_min = (VEC_DATA_TYPE(DATA_TYPE, 16))(MINVAL);
69__constant uint16 idx16 = (uint16)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
Chunosovd6afedc2017-11-06 22:09:45 +070070__constant uint4 idx4 = (uint4)(0, 1, 2, 3);
Anthony Barbier6ff3b192017-09-04 18:44:23 +010071
Anthony Barbier6ff3b192017-09-04 18:44:23 +010072/** Divides all the values of the input tensor by the sum calculated from softmax_layer_shift_exp_sum kernel.
73 *
74 * @note Datatype must be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
75 *
Vidhya Sudhan Loganathan7485d5a2018-07-04 09:34:00 +010076 * @param[in] src_ptr Pointer to the source tensor slice. Supported data types: F16/F32
Anthony Barbier6ff3b192017-09-04 18:44:23 +010077 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
78 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
79 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
80 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
steniu010d523cc2017-07-13 14:24:23 +010081 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
82 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
Anthony Barbier6ff3b192017-09-04 18:44:23 +010083 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
Georgios Pinitase5f8fd62017-06-23 18:03:44 +010084 * @param[in] sum_ptr Pointer to the sum values tensor slice. Supported data types: same as @p src_ptr
Anthony Barbier6ff3b192017-09-04 18:44:23 +010085 * @param[in] sum_stride_x Stride of the sum values tensor in X dimension (in bytes)
86 * @param[in] sum_step_x sum_stride_x * number of elements along X processed per workitem(in bytes)
87 * @param[in] sum_stride_y Stride of the sum values tensor in Y dimension (in bytes)
88 * @param[in] sum_step_y sum_stride_y * number of elements along Y processed per workitem(in bytes)
steniu010d523cc2017-07-13 14:24:23 +010089 * @param[in] sum_stride_z Stride of the sum values tensor in Z dimension (in bytes)
90 * @param[in] sum_step_z sum_stride_z * number of elements along Z processed per workitem(in bytes)
Anthony Barbier6ff3b192017-09-04 18:44:23 +010091 * @param[in] sum_offset_first_element_in_bytes The offset of the first element in the sum values tensor
Georgios Pinitase5f8fd62017-06-23 18:03:44 +010092 * @param[out] dst_ptr Pointer to the destination tensor slice. Supported data types: same as @p src_ptr
Anthony Barbier6ff3b192017-09-04 18:44:23 +010093 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
94 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
95 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
96 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
steniu010d523cc2017-07-13 14:24:23 +010097 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
98 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
Anthony Barbier6ff3b192017-09-04 18:44:23 +010099 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
100 */
101__kernel void softmax_layer_norm(
steniu010d523cc2017-07-13 14:24:23 +0100102 TENSOR3D_DECLARATION(src),
103 TENSOR3D_DECLARATION(sum),
104 TENSOR3D_DECLARATION(dst))
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100105{
steniu010d523cc2017-07-13 14:24:23 +0100106 Image src = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
107 Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
108 Image sum = CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(sum);
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100109
110 // Load max value of 1D logits vector (row)
111 DATA_TYPE sum_val = *((__global DATA_TYPE *)offset(&sum, 0, get_global_id(1)));
112 VEC_DATA_TYPE(DATA_TYPE, 16)
113 data = vload16(0, (__global DATA_TYPE *)offset(&src, 0, 0));
Sang-Hoon Park62eeb532019-10-29 13:13:19 +0000114#ifdef LOG_SOFTMAX
Sang-Hoon Parka0205b92020-07-07 09:36:09 +0100115 sum_val = log(sum_val);
Sang-Hoon Park62eeb532019-10-29 13:13:19 +0000116 vstore16(SUB_OP(data, sum_val, DATA_TYPE, 16), 0, (__global DATA_TYPE *)offset(&dst, 0, 0));
117#else /* LOG_SOFTMAX */
Georgios Pinitase5f8fd62017-06-23 18:03:44 +0100118 vstore16(DIV_OP(data, sum_val, DATA_TYPE, 16), 0, (__global DATA_TYPE *)offset(&dst, 0, 0));
Sang-Hoon Park62eeb532019-10-29 13:13:19 +0000119#endif /* LOG_SOFTMAX */
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100120}
Chunosovd6afedc2017-11-06 22:09:45 +0700121
122/** Identifies the maximum value across the 1st dimension and shifts the values of the input tensor by this maximum value,
123 * then gets the exponent of each element as sums all elements across each row.
124 *
125 * @note Datatype must be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
Chunosovd6afedc2017-11-06 22:09:45 +0700126 * @note In case the input is not a multiple of VECTOR_SIZE (2,4,8,16) -DNON_MULTIPLE_OF_VECTOR_SIZE must be passed.
127 * @note Beta can be optionally passed at compile time using -DBETA (by default, it is 1.0).
128 *
Vidhya Sudhan Loganathan7485d5a2018-07-04 09:34:00 +0100129 * @param[in] src_ptr Pointer to the source tensor slice. Supported data types: F16/F32
Chunosovd6afedc2017-11-06 22:09:45 +0700130 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
131 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
132 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
133 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
134 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
135 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
136 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
137 * @param[in] maxo_ptr Pointer to the max values tensor slice. Supported data types: same as @p src_ptr
138 * @param[in] maxo_stride_x Stride of the max values tensor in X dimension (in bytes)
139 * @param[in] maxo_step_x max_stride_x * number of elements along X processed per workitem(in bytes)
140 * @param[in] maxo_stride_y Stride of the max values tensor in Y dimension (in bytes)
141 * @param[in] maxo_step_y max_stride_y * number of elements along Y processed per workitem(in bytes)
142 * @param[in] maxo_stride_z Stride of the max values tensor in Z dimension (in bytes)
143 * @param[in] maxo_step_z max_stride_z * number of elements along Z processed per workitem(in bytes)
144 * @param[in] maxo_offset_first_element_in_bytes The offset of the first element in the max values tensor
145 * @param[out] dst_ptr Pointer to the destination tensor slice. Supported data types: same as @p src_ptr
146 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
147 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
148 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
149 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
150 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
151 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
152 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
153 * @param[out] sum_ptr Pointer to the sum values tensor slice. Supported data types: same as @p src_ptr
154 * @param[in] sum_stride_x Stride of the sum values tensor in X dimension (in bytes)
155 * @param[in] sum_step_x sum_stride_x * number of elements along X processed per workitem(in bytes)
156 * @param[in] sum_stride_y Stride of the sum values tensor in Y dimension (in bytes)
157 * @param[in] sum_step_y sum_stride_z * number of elements along Z processed per workitem(in bytes)
158 * @param[in] sum_stride_z Stride of the sum values tensor in Z dimension (in bytes)
159 * @param[in] sum_step_z sum_stride_z * number of elements along Z processed per workitem(in bytes)
160 * @param[in] sum_offset_first_element_in_bytes The offset of the first element in the sum values tensor
161 * @param[in] width Input image width
162 */
163__kernel void softmax_layer_max_shift_exp_sum_serial(
164 TENSOR3D_DECLARATION(src),
165 TENSOR3D_DECLARATION(maxo),
166 TENSOR3D_DECLARATION(dst),
167 TENSOR3D_DECLARATION(sum),
168 uint width)
169{
170 Image src = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
171 Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
172 Image maxo = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(maxo);
173 Image sum = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(sum);
174
175#ifdef BETA
176 // Initialize beta
177 VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)
Georgios Pinitas4df76c92017-11-10 10:26:11 +0000178 beta = (VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE))BETA;
Chunosovd6afedc2017-11-06 22:09:45 +0700179#endif /* BETA */
180
181 // Initialize local maximum
182 VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)
183 max_val_vec = (VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE))type_min_;
184
185 // Calculate max of row
186 const uint width_ = width >> LOG_VECTOR_SIZE;
187 for(uint i = 0; i < width_; i++)
188 {
189 VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)
190 data_max = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)offset(&src, i << LOG_VECTOR_SIZE, 0));
191 max_val_vec = MAX_OP(data_max, max_val_vec, DATA_TYPE, VECTOR_SIZE);
192 }
193
194#ifdef NON_MULTIPLE_OF_VECTOR_SIZE
195 VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)
196 data_max = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)offset(&src, width_ << LOG_VECTOR_SIZE, 0));
197 VEC_DATA_TYPE(SELECT_DATA_TYPE, VECTOR_SIZE)
198 widx = CONVERT((EXPAND((CL_VEC_DATA_TYPE(uint, VECTOR_SIZE)))(width_ << LOG_VECTOR_SIZE) + idx__) < width, VEC_DATA_TYPE(SELECT_DATA_TYPE, VECTOR_SIZE));
199 max_val_vec = MAX_OP(max_val_vec, select(type_min_, data_max, widx), DATA_TYPE, VECTOR_SIZE);
200#endif /* NON_MULTIPLE_OF_VECTOR_SIZE */
201
202 // Perform max reduction
203#if VECTOR_SIZE == 16
204 max_val_vec.s01234567 = MAX_OP(max_val_vec.s01234567, max_val_vec.s89ABCDEF, DATA_TYPE, 8);
205#endif /* VECTOR SIZE 16 END */
206#if VECTOR_SIZE >= 8
207 max_val_vec.s0123 = MAX_OP(max_val_vec.s0123, max_val_vec.s4567, DATA_TYPE, 4);
208#endif /* VECTOR SIZE 8 END */
209#if VECTOR_SIZE >= 4
210 max_val_vec.s01 = MAX_OP(max_val_vec.s01, max_val_vec.s23, DATA_TYPE, 2);
211#endif /* VECTOR SIZE 4 END */
212 max_val_vec.s0 = MAX_OP(max_val_vec.s0, max_val_vec.s1, DATA_TYPE, 1);
213 // Store result
214 *((__global DATA_TYPE *)maxo.ptr) = max_val_vec.s0;
215
216 /* Second section */
217
218 // Load max value of 1D logits vector (row)
219 DATA_TYPE max_val = *((__global DATA_TYPE *)offset(&maxo, 0, 0));
220
221 // Set sum vector
222 VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)
223 sum1D = 0;
224
225 // Shift values, exp and sum
226 for(uint i = 0; i < width_; i++)
227 {
228 VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)
229 data = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)offset(&src, i << LOG_VECTOR_SIZE, 0));
230 data = SUB_OP(data, max_val, DATA_TYPE, VECTOR_SIZE);
231#ifdef BETA
232 data = MUL_OP(data, beta, DATA_TYPE, VECTOR_SIZE);
233#endif /* BETA */
Sang-Hoon Park62eeb532019-10-29 13:13:19 +0000234#ifdef LOG_SOFTMAX
235 VSTORE(VECTOR_SIZE)
236 (data, 0, (__global DATA_TYPE *)offset(&dst, i << LOG_VECTOR_SIZE, 0));
237 data = EXP_OP(data, DATA_TYPE, VECTOR_SIZE);
238#else /* LOG_SOFTMAX */
Chunosovd6afedc2017-11-06 22:09:45 +0700239 data = EXP_OP(data, DATA_TYPE, VECTOR_SIZE);
240 VSTORE(VECTOR_SIZE)
241 (data, 0, (__global DATA_TYPE *)offset(&dst, i << LOG_VECTOR_SIZE, 0));
Sang-Hoon Park62eeb532019-10-29 13:13:19 +0000242#endif /* LOG_SOFTMAX */
Chunosovd6afedc2017-11-06 22:09:45 +0700243 sum1D = ADD_OP(sum1D, data, DATA_TYPE, VECTOR_SIZE);
244 }
245
246#ifdef NON_MULTIPLE_OF_VECTOR_SIZE
247 VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)
248 data = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)offset(&src, width_ << LOG_VECTOR_SIZE, 0));
249 data = SUB_OP(data, max_val, DATA_TYPE, VECTOR_SIZE);
250#ifdef BETA
251 data = MUL_OP(data, beta, DATA_TYPE, VECTOR_SIZE);
252#endif /* BETA */
Sang-Hoon Park62eeb532019-10-29 13:13:19 +0000253#ifdef LOG_SOFTMAX
254 VSTORE(VECTOR_SIZE)
255 (data, 0, (__global DATA_TYPE *)offset(&dst, width_ << LOG_VECTOR_SIZE, 0));
256 data = EXP_OP(data, DATA_TYPE, VECTOR_SIZE);
257 widx = CONVERT((EXPAND((CL_VEC_DATA_TYPE(uint, VECTOR_SIZE)))(width_ << LOG_VECTOR_SIZE) + idx__) < width, VEC_DATA_TYPE(SELECT_DATA_TYPE, VECTOR_SIZE));
258 data = select(0, data, widx);
259#else /* LOG_SOFTMAX */
Chunosovd6afedc2017-11-06 22:09:45 +0700260 data = EXP_OP(data, DATA_TYPE, VECTOR_SIZE);
261 widx = CONVERT((EXPAND((CL_VEC_DATA_TYPE(uint, VECTOR_SIZE)))(width_ << LOG_VECTOR_SIZE) + idx__) < width, VEC_DATA_TYPE(SELECT_DATA_TYPE, VECTOR_SIZE));
262 data = select(0, data, widx);
263 VSTORE(VECTOR_SIZE)
264 (data, 0, (__global DATA_TYPE *)offset(&dst, width_ << LOG_VECTOR_SIZE, 0));
Sang-Hoon Park62eeb532019-10-29 13:13:19 +0000265#endif /* LOG_SOFTMAX */
Chunosovd6afedc2017-11-06 22:09:45 +0700266 sum1D = ADD_OP(sum1D, data, DATA_TYPE, VECTOR_SIZE);
267#endif /* NON_MULTIPLE_OF_VECTOR_SIZE */
268
269 // Perform sum reduction
270#if VECTOR_SIZE == 16
271 sum1D.s01234567 = ADD_OP(sum1D.s01234567, sum1D.s89ABCDEF, DATA_TYPE, 8);
272#endif /* VECTOR SIZE 16 END */
273#if VECTOR_SIZE >= 8
274 sum1D.s0123 = ADD_OP(sum1D.s0123, sum1D.s4567, DATA_TYPE, 4);
275#endif /* VECTOR SIZE 8 END */
276#if VECTOR_SIZE >= 4
277 sum1D.s01 = ADD_OP(sum1D.s01, sum1D.s23, DATA_TYPE, 2);
278#endif /* VECTOR SIZE 4 END */
279 sum1D.s0 = ADD_OP(sum1D.s0, sum1D.s1, DATA_TYPE, 1);
280
281 // Calculate and store result
282 *((__global DATA_TYPE *)sum.ptr) = sum1D.s0;
283}
284
285/** Identifies the maximum value across the 1st dimension and shifts the values of the input tensor by this maximum value,
286 * then gets the exponent of each element as sums all elements across each row.
287 *
288 * @note Datatype must be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
Chunosovd6afedc2017-11-06 22:09:45 +0700289 * @note In case the input is not a multiple of VECTOR_SIZE (2,4,8,16) -DNON_MULTIPLE_OF_VECTOR_SIZE must be passed.
290 * @note Beta can be optionally passed at compile time using -DBETA (by default, it is 1.0).
291 *
Vidhya Sudhan Loganathan7485d5a2018-07-04 09:34:00 +0100292 * @param[in] src_ptr Pointer to the source tensor slice. Supported data types: F16/F32
Chunosovd6afedc2017-11-06 22:09:45 +0700293 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
294 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
295 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
296 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
297 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
298 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
299 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
300 * @param[in] maxo_ptr Pointer to the max values tensor slice. Supported data types: same as @p src_ptr
301 * @param[in] maxo_stride_x Stride of the max values tensor in X dimension (in bytes)
302 * @param[in] maxo_step_x max_stride_x * number of elements along X processed per workitem(in bytes)
303 * @param[in] maxo_stride_y Stride of the max values tensor in Y dimension (in bytes)
304 * @param[in] maxo_step_y max_stride_y * number of elements along Y processed per workitem(in bytes)
305 * @param[in] maxo_stride_z Stride of the max values tensor in Z dimension (in bytes)
306 * @param[in] maxo_step_z max_stride_z * number of elements along Z processed per workitem(in bytes)
307 * @param[in] maxo_offset_first_element_in_bytes The offset of the first element in the max values tensor
308 * @param[out] dst_ptr Pointer to the destination tensor slice. Supported data types: same as @p src_ptr
309 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
310 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
311 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
312 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
313 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
314 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
315 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
316 * @param[out] sum_ptr Pointer to the sum values tensor slice. Supported data types: same as @p src_ptr
317 * @param[in] sum_stride_x Stride of the sum values tensor in X dimension (in bytes)
318 * @param[in] sum_step_x sum_stride_x * number of elements along X processed per workitem(in bytes)
319 * @param[in] sum_stride_y Stride of the sum values tensor in Y dimension (in bytes)
320 * @param[in] sum_step_y sum_stride_z * number of elements along Z processed per workitem(in bytes)
321 * @param[in] sum_stride_z Stride of the sum values tensor in Z dimension (in bytes)
322 * @param[in] sum_step_z sum_stride_z * number of elements along Z processed per workitem(in bytes)
323 * @param[in] sum_offset_first_element_in_bytes The offset of the first element in the sum values tensor
324 * @param[in] width Input image width
325 */
326__kernel void softmax_layer_max_shift_exp_sum_parallel(
327 TENSOR3D_DECLARATION(src),
328 TENSOR3D_DECLARATION(maxo),
329 TENSOR3D_DECLARATION(dst),
330 TENSOR3D_DECLARATION(sum),
331 uint width)
332{
333 Image src = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
334 Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
335 Image maxo = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(maxo);
336 Image sum = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(sum);
337
338 const uint lid = get_local_id(0);
339
340#ifdef BETA
341 // Initialize beta
342 VEC_DATA_TYPE(DATA_TYPE, 4)
343 beta = (VEC_DATA_TYPE(DATA_TYPE, 4))BETA;
344#endif /* BETA */
345
346 // Define one temporary vector per work-item.
347 __local VEC_DATA_TYPE(DATA_TYPE, 4) tmp_local[GRID_SIZE];
348 __local DATA_TYPE max_local;
349
350 __constant VEC_DATA_TYPE(DATA_TYPE, 4) type_min4 = (VEC_DATA_TYPE(DATA_TYPE, 4))(MINVAL);
351 VEC_DATA_TYPE(DATA_TYPE, 4)
352 max_val_vec = (VEC_DATA_TYPE(DATA_TYPE, 4))type_min4;
353 // Number of elements per work-item.
354 const uint row = width / GRID_SIZE;
355 // Number of iterations per work-item.
356 const uint width_ = row >> 2;
357 // Calculate max of row
358 uint i = 0;
359 for(; i < width_; i++)
360 {
361 VEC_DATA_TYPE(DATA_TYPE, 4)
362 data_max = VLOAD(4)(0, (__global DATA_TYPE *)offset(&src, i * GRID_SIZE * 4, 0));
363 max_val_vec = MAX_OP(data_max, max_val_vec, DATA_TYPE, 4);
364 }
365#ifdef NON_MULTIPLE_OF_GRID_SIZE
366 // How many work-items needed to complete the computation.
367 //TODO: Optimize this calculation (avoid %).
368 int boundary_workitems = (width % (GRID_SIZE * 4)) / 4;
369 if(lid < boundary_workitems)
370 {
371 VEC_DATA_TYPE(DATA_TYPE, 4)
372 data_max = VLOAD(4)(0, (__global DATA_TYPE *)offset(&src, i * GRID_SIZE * 4, 0));
373 max_val_vec = MAX_OP(data_max, max_val_vec, DATA_TYPE, 4);
374 }
375#ifdef NON_MULTIPLE_OF_VECTOR_SIZE
376 if(boundary_workitems == 0)
377 {
378 boundary_workitems = GRID_SIZE;
379 i--;
380 }
381 if(lid == (boundary_workitems - 1))
382 {
383 // Handle non multiple of 4
384 VEC_DATA_TYPE(DATA_TYPE, 4)
385 data_max = VLOAD(4)(0, (__global DATA_TYPE *)offset(&src, (GRID_SIZE * i * 4) + 4, 0));
386 VEC_DATA_TYPE(SELECT_DATA_TYPE, 4)
387 widx = CONVERT(((uint4)(GRID_SIZE * i * 4) + boundary_workitems * 4 + idx4) < width, VEC_DATA_TYPE(SELECT_DATA_TYPE, 4));
388 max_val_vec = MAX_OP(max_val_vec, select(type_min_, data_max, widx), DATA_TYPE, 4);
389 }
390#endif /* NON_MULTIPLE_OF_VECTOR_SIZE */
391#endif /* NON_MULTIPLE_OF_GRID_SIZE */
392 tmp_local[lid] = max_val_vec;
393
394 barrier(CLK_LOCAL_MEM_FENCE);
395
396 if(GRID_SIZE >= 256)
397 {
398 if(lid < 128)
399 {
400 tmp_local[lid] = MAX_OP(tmp_local[lid + 128], tmp_local[lid], DATA_TYPE, 4);
401 }
402 barrier(CLK_LOCAL_MEM_FENCE);
403 }
404 if(GRID_SIZE >= 128)
405 {
406 if(lid < 64)
407 {
408 tmp_local[lid] = MAX_OP(tmp_local[lid + 64], tmp_local[lid], DATA_TYPE, 4);
409 }
410 barrier(CLK_LOCAL_MEM_FENCE);
411 }
412 if(GRID_SIZE >= 64)
413 {
414 if(lid < 32)
415 {
416 tmp_local[lid] = MAX_OP(tmp_local[lid + 32], tmp_local[lid], DATA_TYPE, 4);
417 }
418 barrier(CLK_LOCAL_MEM_FENCE);
419 }
420 if(GRID_SIZE >= 32)
421 {
422 if(lid < 16)
423 {
424 tmp_local[lid] = MAX_OP(tmp_local[lid + 16], tmp_local[lid], DATA_TYPE, 4);
425 }
426 barrier(CLK_LOCAL_MEM_FENCE);
427 }
428 if(GRID_SIZE >= 16)
429 {
430 if(lid < 8)
431 {
432 tmp_local[lid] = MAX_OP(tmp_local[lid + 8], tmp_local[lid], DATA_TYPE, 4);
433 }
434 barrier(CLK_LOCAL_MEM_FENCE);
435 }
436 if(GRID_SIZE >= 8)
437 {
438 if(lid < 4)
439 {
440 tmp_local[lid] = MAX_OP(tmp_local[lid + 4], tmp_local[lid], DATA_TYPE, 4);
441 }
442 barrier(CLK_LOCAL_MEM_FENCE);
443 }
444 if(GRID_SIZE >= 4)
445 {
446 if(lid < 2)
447 {
448 tmp_local[lid] = MAX_OP(tmp_local[lid + 2], tmp_local[lid], DATA_TYPE, 4);
449 }
450 barrier(CLK_LOCAL_MEM_FENCE);
451 }
452 if(lid == 0)
453 {
454 max_val_vec = MAX_OP(tmp_local[lid + 1], tmp_local[lid], DATA_TYPE, 4);
455 max_val_vec.s01 = MAX_OP(max_val_vec.s01, max_val_vec.s23, DATA_TYPE, 2);
456 max_val_vec.s0 = MAX_OP(max_val_vec.s0, max_val_vec.s1, DATA_TYPE, 1);
457 max_local = max_val_vec.s0;
458 }
459 barrier(CLK_LOCAL_MEM_FENCE);
460
461 /* Second section */
462
463 // Set sum vector
464 VEC_DATA_TYPE(DATA_TYPE, 4)
465 sum1D = 0;
466 DATA_TYPE max_val = max_local;
467
468 // Shift values, exp and sum
469 for(i = 0; i < width_; i++)
470 {
471 VEC_DATA_TYPE(DATA_TYPE, 4)
472 data = VLOAD(4)(0, (__global DATA_TYPE *)offset(&src, i * GRID_SIZE * 4, 0));
473 data = SUB_OP(data, max_val, DATA_TYPE, 4);
474#ifdef BETA
475 data = MUL_OP(data, beta, DATA_TYPE, 4);
476#endif /* BETA */
Sang-Hoon Park62eeb532019-10-29 13:13:19 +0000477#ifdef LOG_SOFTMAX
478 VSTORE(4)
479 (data, 0, (__global DATA_TYPE *)offset(&dst, i * GRID_SIZE * 4, 0));
480 data = EXP_OP(data, DATA_TYPE, 4);
481#else /* LOG_SOFTMAX */
Chunosovd6afedc2017-11-06 22:09:45 +0700482 data = EXP_OP(data, DATA_TYPE, 4);
483 VSTORE(4)
484 (data, 0, (__global DATA_TYPE *)offset(&dst, i * GRID_SIZE * 4, 0));
Sang-Hoon Park62eeb532019-10-29 13:13:19 +0000485#endif /* LOG_SOFTMAX */
Chunosovd6afedc2017-11-06 22:09:45 +0700486 sum1D = ADD_OP(sum1D, data, DATA_TYPE, 4);
487 }
488#ifdef NON_MULTIPLE_OF_GRID_SIZE
489 //TODO: Optimize the calculation (avoid %).
490 boundary_workitems = (width % (GRID_SIZE * 4)) / 4;
491 if(lid < boundary_workitems)
492 {
493 VEC_DATA_TYPE(DATA_TYPE, 4)
494 data = VLOAD(4)(0, (__global DATA_TYPE *)offset(&src, i * GRID_SIZE * 4, 0));
495 data = SUB_OP(data, max_val, DATA_TYPE, 4);
496#ifdef BETA
497 data = MUL_OP(data, beta, DATA_TYPE, 4);
498#endif /* BETA */
Sang-Hoon Park62eeb532019-10-29 13:13:19 +0000499#ifdef LOG_SOFTMAX
500 VSTORE(4)
501 (data, 0, (__global DATA_TYPE *)offset(&dst, i * GRID_SIZE * 4, 0));
502 data = EXP_OP(data, DATA_TYPE, 4);
503#else /* LOG_SOFTMAX */
Chunosovd6afedc2017-11-06 22:09:45 +0700504 data = EXP_OP(data, DATA_TYPE, 4);
505 VSTORE(4)
506 (data, 0, (__global DATA_TYPE *)offset(&dst, i * GRID_SIZE * 4, 0));
Sang-Hoon Park62eeb532019-10-29 13:13:19 +0000507#endif /* LOG_SOFTMAX */
Chunosovd6afedc2017-11-06 22:09:45 +0700508 sum1D = ADD_OP(sum1D, data, DATA_TYPE, 4);
509 }
510#ifdef NON_MULTIPLE_OF_VECTOR_SIZE
511 if(boundary_workitems == 0)
512 {
513 boundary_workitems = GRID_SIZE;
514 i--;
515 }
516 if(lid == (boundary_workitems - 1))
517 {
518 // Handle non multiple of vector size ((GRID_SIZE * i * 4) + 4, 0); move 4 float positions ahead, *4 is due to the stride
519 VEC_DATA_TYPE(DATA_TYPE, 4)
520 data = VLOAD(4)(0, (__global DATA_TYPE *)offset(&src, (GRID_SIZE * i * 4) + 4, 0));
521 data = SUB_OP(data, max_val, DATA_TYPE, 4);
522#ifdef BETA
523 data = MUL_OP(data, beta, DATA_TYPE, 4);
524#endif /* BETA */
Sang-Hoon Park62eeb532019-10-29 13:13:19 +0000525#ifdef LOG_SOFTMAX
526 VSTORE(4)
527 (data, 0, (__global DATA_TYPE *)offset(&dst, (GRID_SIZE * i * 4) + 4, 0));
528 data = EXP_OP(data, DATA_TYPE, 4);
529 VEC_DATA_TYPE(SELECT_DATA_TYPE, 4)
530 widx = CONVERT(((uint4)(GRID_SIZE * i * 4) + boundary_workitems * 4 + idx4) < width, VEC_DATA_TYPE(SELECT_DATA_TYPE, 4));
531 data = select(0, data, widx);
532#else /* LOG_SOFTMAX */
Chunosovd6afedc2017-11-06 22:09:45 +0700533 data = EXP_OP(data, DATA_TYPE, 4);
534 VEC_DATA_TYPE(SELECT_DATA_TYPE, 4)
535 widx = CONVERT(((uint4)(GRID_SIZE * i * 4) + boundary_workitems * 4 + idx4) < width, VEC_DATA_TYPE(SELECT_DATA_TYPE, 4));
536 data = select(0, data, widx);
537 VSTORE(4)
538 (data, 0, (__global DATA_TYPE *)offset(&dst, (GRID_SIZE * i * 4) + 4, 0));
Sang-Hoon Park62eeb532019-10-29 13:13:19 +0000539#endif /* LOG_SOFTMAX */
Chunosovd6afedc2017-11-06 22:09:45 +0700540 sum1D = ADD_OP(sum1D, data, DATA_TYPE, 4);
541 }
542#endif /* NON_MULTIPLE_OF_VECTOR_SIZE */
543#endif /* NON_MULTIPLE_OF_GRID_SIZE */
544 tmp_local[lid] = sum1D;
545
546 barrier(CLK_LOCAL_MEM_FENCE);
547
548 if(GRID_SIZE >= 256)
549 {
550 if(lid < 128)
551 {
552 tmp_local[lid] = ADD_OP(tmp_local[lid + 128], tmp_local[lid], DATA_TYPE, 4);
553 }
554 barrier(CLK_LOCAL_MEM_FENCE);
555 }
556 if(GRID_SIZE >= 128)
557 {
558 if(lid < 64)
559 {
560 tmp_local[lid] = ADD_OP(tmp_local[lid + 64], tmp_local[lid], DATA_TYPE, 4);
561 }
562 barrier(CLK_LOCAL_MEM_FENCE);
563 }
564 if(GRID_SIZE >= 64)
565 {
566 if(lid < 32)
567 {
568 tmp_local[lid] = ADD_OP(tmp_local[lid + 32], tmp_local[lid], DATA_TYPE, 4);
569 }
570 barrier(CLK_LOCAL_MEM_FENCE);
571 }
572 if(GRID_SIZE >= 32)
573 {
574 if(lid < 16)
575 {
576 tmp_local[lid] = ADD_OP(tmp_local[lid + 16], tmp_local[lid], DATA_TYPE, 4);
577 }
578 barrier(CLK_LOCAL_MEM_FENCE);
579 }
580 if(GRID_SIZE >= 16)
581 {
582 if(lid < 8)
583 {
584 tmp_local[lid] = ADD_OP(tmp_local[lid + 8], tmp_local[lid], DATA_TYPE, 4);
585 }
586 barrier(CLK_LOCAL_MEM_FENCE);
587 }
588 if(GRID_SIZE >= 8)
589 {
590 if(lid < 4)
591 {
592 tmp_local[lid] = ADD_OP(tmp_local[lid + 4], tmp_local[lid], DATA_TYPE, 4);
593 }
594 barrier(CLK_LOCAL_MEM_FENCE);
595 }
596 if(GRID_SIZE >= 4)
597 {
598 if(lid < 2)
599 {
600 tmp_local[lid] = ADD_OP(tmp_local[lid + 2], tmp_local[lid], DATA_TYPE, 4);
601 }
602 barrier(CLK_LOCAL_MEM_FENCE);
603 }
604 if(lid == 0)
605 {
606 sum1D = ADD_OP(tmp_local[lid + 1], tmp_local[lid], DATA_TYPE, 4);
607 // Perform max reduction
608 sum1D.s01 = ADD_OP(sum1D.s01, sum1D.s23, DATA_TYPE, 2);
609 sum1D.s0 = ADD_OP(sum1D.s0, sum1D.s1, DATA_TYPE, 1);
610 *((__global DATA_TYPE *)sum.ptr) = sum1D.s0;
611 }
612}