/*
 * Copyright (c) 2017-2018 ARM Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#include "helpers.h"

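/* Compile-time configuration (illustrative summary; the exact options are chosen by the host
 * when this program is built):
 *   -DDATA_TYPE=<type>        element type, e.g. float or half
 *   -DUSE_F16                 selects half-precision limits and the select() mask type
 *   -DVECTOR_SIZE=<2|4|8|16>  vector width used by the serial kernel (defaults to 16 below)
 *   -DLOG_VECTOR_SIZE=<n>     log2 of VECTOR_SIZE, used for the per-row loop bounds
 *   -DBETA=<value>            optional softmax scaling factor (defaults to 1.0)
 *   -DGRID_SIZE=<n>           number of work-items cooperating on one row (parallel kernel)
 *   -DNON_MULTIPLE_OF_VECTOR_SIZE / -DNON_MULTIPLE_OF_GRID_SIZE
 *                             enable the leftover-element handling paths
 */
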
#define MAX_OP(x, y, type, size) max((x), (y))
#define ADD_OP(x, y, type, size) ((x) + (y))
#define SUB_OP(x, y, type, size) ((x) - (y))
#define MUL_OP(x, y, type, size) ((x) * (y))
#define DIV_OP(x, y, type, size) ((x) / (y))
#define EXP_OP(x, type, size) exp((x))

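/* select() requires an integer mask whose elements match the width of DATA_TYPE,
 * hence 16-bit short for half (USE_F16) and 32-bit int for float. */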
#ifdef USE_F16
#define MINVAL -HALF_MAX
#define SELECT_DATA_TYPE short
#else /* USE_F16 */
#define MINVAL -FLT_MAX
#define SELECT_DATA_TYPE int
#endif /* USE_F16 */

/* Number of workitems in dimension 0. */
#if !defined(GRID_SIZE)
#define GRID_SIZE 1
#endif /* !defined(GRID_SIZE) */

/* Vector size, i.e. number of vector elements. */
#if VECTOR_SIZE == 2
__constant VEC_DATA_TYPE(DATA_TYPE, 2) type_min_ = (VEC_DATA_TYPE(DATA_TYPE, 2))(MINVAL);
__constant uint2 idx__ = (uint2)(0, 1);

#elif VECTOR_SIZE == 4
__constant VEC_DATA_TYPE(DATA_TYPE, 4) type_min_ = (VEC_DATA_TYPE(DATA_TYPE, 4))(MINVAL);
__constant uint4 idx__ = (uint4)(0, 1, 2, 3);

#elif VECTOR_SIZE == 8
__constant VEC_DATA_TYPE(DATA_TYPE, 8) type_min_ = (VEC_DATA_TYPE(DATA_TYPE, 8))(MINVAL);
__constant uint8 idx__ = (uint8)(0, 1, 2, 3, 4, 5, 6, 7);

#else /* VECTOR_SIZE DEFAULT */
#define VECTOR_SIZE 16
#define LOG_VECTOR_SIZE 4
__constant VEC_DATA_TYPE(DATA_TYPE, 16) type_min_ = (VEC_DATA_TYPE(DATA_TYPE, 16))(MINVAL);
__constant uint16 idx__ = (uint16)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);

#endif /* VECTOR_SIZE END */

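/* Note: LOG_VECTOR_SIZE is only defined here for the default VECTOR_SIZE of 16; for
 * VECTOR_SIZE 2, 4 or 8 it is expected to be passed as a build option alongside it. */
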
// TODO (COMPMID-661): Remove if the non-fused kernels are removed
__constant VEC_DATA_TYPE(DATA_TYPE, 16) type_min = (VEC_DATA_TYPE(DATA_TYPE, 16))(MINVAL);
__constant uint16 idx16 = (uint16)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
__constant uint4 idx4 = (uint4)(0, 1, 2, 3);

/** Divides all the values of the input tensor by the sum calculated from the softmax_layer_shift_exp_sum kernel.
 *
 * @note Datatype must be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
 *
 * @param[in] src_ptr Pointer to the source tensor slice. Supported data types: F16/F32
 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem (in bytes)
 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
 * @param[in] sum_ptr Pointer to the sum values tensor slice. Supported data types: same as @p src_ptr
 * @param[in] sum_stride_x Stride of the sum values tensor in X dimension (in bytes)
 * @param[in] sum_step_x sum_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in] sum_stride_y Stride of the sum values tensor in Y dimension (in bytes)
 * @param[in] sum_step_y sum_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in] sum_stride_z Stride of the sum values tensor in Z dimension (in bytes)
 * @param[in] sum_step_z sum_stride_z * number of elements along Z processed per workitem (in bytes)
 * @param[in] sum_offset_first_element_in_bytes The offset of the first element in the sum values tensor
 * @param[out] dst_ptr Pointer to the destination tensor slice. Supported data types: same as @p src_ptr
 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem (in bytes)
 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
 */
__kernel void softmax_layer_norm(
    TENSOR3D_DECLARATION(src),
    TENSOR3D_DECLARATION(sum),
    TENSOR3D_DECLARATION(dst))
{
    Image src = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
    Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
    Image sum = CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(sum);

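    // Final normalization step: src already holds exp(x - max) produced by the shift/exp/sum
    // kernel, so dividing by the per-row sum yields the softmax probabilities.
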
    // Load sum value of 1D logits vector (row)
    DATA_TYPE sum_val = *((__global DATA_TYPE *)offset(&sum, 0, get_global_id(1)));
    VEC_DATA_TYPE(DATA_TYPE, 16)
    data = vload16(0, (__global DATA_TYPE *)offset(&src, 0, 0));
    vstore16(DIV_OP(data, sum_val, DATA_TYPE, 16), 0, (__global DATA_TYPE *)offset(&dst, 0, 0));
}

/** Identifies the maximum value across the 1st dimension and shifts the values of the input tensor by this maximum value,
 * then computes the exponent of each element and sums all elements across each row.
 *
 * @note Datatype must be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
 * @note In case the input is not a multiple of VECTOR_SIZE (2,4,8,16), -DNON_MULTIPLE_OF_VECTOR_SIZE must be passed.
 * @note Beta can be optionally passed at compile time using -DBETA (by default, it is 1.0).
 *
 * @param[in] src_ptr Pointer to the source tensor slice. Supported data types: F16/F32
 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem (in bytes)
 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
 * @param[in] maxo_ptr Pointer to the max values tensor slice. Supported data types: same as @p src_ptr
 * @param[in] maxo_stride_x Stride of the max values tensor in X dimension (in bytes)
 * @param[in] maxo_step_x maxo_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in] maxo_stride_y Stride of the max values tensor in Y dimension (in bytes)
 * @param[in] maxo_step_y maxo_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in] maxo_stride_z Stride of the max values tensor in Z dimension (in bytes)
 * @param[in] maxo_step_z maxo_stride_z * number of elements along Z processed per workitem (in bytes)
 * @param[in] maxo_offset_first_element_in_bytes The offset of the first element in the max values tensor
 * @param[out] dst_ptr Pointer to the destination tensor slice. Supported data types: same as @p src_ptr
 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem (in bytes)
 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
 * @param[out] sum_ptr Pointer to the sum values tensor slice. Supported data types: same as @p src_ptr
 * @param[in] sum_stride_x Stride of the sum values tensor in X dimension (in bytes)
 * @param[in] sum_step_x sum_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in] sum_stride_y Stride of the sum values tensor in Y dimension (in bytes)
 * @param[in] sum_step_y sum_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in] sum_stride_z Stride of the sum values tensor in Z dimension (in bytes)
 * @param[in] sum_step_z sum_stride_z * number of elements along Z processed per workitem (in bytes)
 * @param[in] sum_offset_first_element_in_bytes The offset of the first element in the sum values tensor
 * @param[in] width Input image width
 */
__kernel void softmax_layer_max_shift_exp_sum_serial(
    TENSOR3D_DECLARATION(src),
    TENSOR3D_DECLARATION(maxo),
    TENSOR3D_DECLARATION(dst),
    TENSOR3D_DECLARATION(sum),
    uint width)
{
    Image src  = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
    Image dst  = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
    Image maxo = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(maxo);
    Image sum  = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(sum);

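    // One work-item per row: a first pass finds the row maximum, a second pass stores
    // exp(x - max) (optionally scaled by beta) to dst and accumulates the row sum.
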
#ifdef BETA
    // Initialize beta
    VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)
    beta = (VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE))BETA;
#endif /* BETA */

    // Initialize local maximum
    VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)
    max_val_vec = (VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE))type_min_;

    // Calculate max of row
    const uint width_ = width >> LOG_VECTOR_SIZE;
    for(uint i = 0; i < width_; i++)
    {
        VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)
        data_max = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)offset(&src, i << LOG_VECTOR_SIZE, 0));
        max_val_vec = MAX_OP(data_max, max_val_vec, DATA_TYPE, VECTOR_SIZE);
    }

#ifdef NON_MULTIPLE_OF_VECTOR_SIZE
    VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)
    data_max = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)offset(&src, width_ << LOG_VECTOR_SIZE, 0));
    VEC_DATA_TYPE(SELECT_DATA_TYPE, VECTOR_SIZE)
    widx = CONVERT((EXPAND((CL_VEC_DATA_TYPE(uint, VECTOR_SIZE)))(width_ << LOG_VECTOR_SIZE) + idx__) < width, VEC_DATA_TYPE(SELECT_DATA_TYPE, VECTOR_SIZE));
    max_val_vec = MAX_OP(max_val_vec, select(type_min_, data_max, widx), DATA_TYPE, VECTOR_SIZE);
#endif /* NON_MULTIPLE_OF_VECTOR_SIZE */

    // Perform max reduction
#if VECTOR_SIZE == 16
    max_val_vec.s01234567 = MAX_OP(max_val_vec.s01234567, max_val_vec.s89ABCDEF, DATA_TYPE, 8);
#endif /* VECTOR SIZE 16 END */
#if VECTOR_SIZE >= 8
    max_val_vec.s0123 = MAX_OP(max_val_vec.s0123, max_val_vec.s4567, DATA_TYPE, 4);
#endif /* VECTOR SIZE 8 END */
#if VECTOR_SIZE >= 4
    max_val_vec.s01 = MAX_OP(max_val_vec.s01, max_val_vec.s23, DATA_TYPE, 2);
#endif /* VECTOR SIZE 4 END */
    max_val_vec.s0 = MAX_OP(max_val_vec.s0, max_val_vec.s1, DATA_TYPE, 1);
    // Store result
    *((__global DATA_TYPE *)maxo.ptr) = max_val_vec.s0;

    /* Second section */

    // Load max value of 1D logits vector (row)
    DATA_TYPE max_val = *((__global DATA_TYPE *)offset(&maxo, 0, 0));

    // Set sum vector
    VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)
    sum1D = 0;

    // Shift values, exp and sum
    for(uint i = 0; i < width_; i++)
    {
        VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)
        data = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)offset(&src, i << LOG_VECTOR_SIZE, 0));
        data = SUB_OP(data, max_val, DATA_TYPE, VECTOR_SIZE);
#ifdef BETA
        data = MUL_OP(data, beta, DATA_TYPE, VECTOR_SIZE);
#endif /* BETA */
        data = EXP_OP(data, DATA_TYPE, VECTOR_SIZE);
        VSTORE(VECTOR_SIZE)
        (data, 0, (__global DATA_TYPE *)offset(&dst, i << LOG_VECTOR_SIZE, 0));
        sum1D = ADD_OP(sum1D, data, DATA_TYPE, VECTOR_SIZE);
    }

#ifdef NON_MULTIPLE_OF_VECTOR_SIZE
    VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)
    data = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)offset(&src, width_ << LOG_VECTOR_SIZE, 0));
    data = SUB_OP(data, max_val, DATA_TYPE, VECTOR_SIZE);
#ifdef BETA
    data = MUL_OP(data, beta, DATA_TYPE, VECTOR_SIZE);
#endif /* BETA */
    data = EXP_OP(data, DATA_TYPE, VECTOR_SIZE);
    widx = CONVERT((EXPAND((CL_VEC_DATA_TYPE(uint, VECTOR_SIZE)))(width_ << LOG_VECTOR_SIZE) + idx__) < width, VEC_DATA_TYPE(SELECT_DATA_TYPE, VECTOR_SIZE));
    data = select(0, data, widx);
    VSTORE(VECTOR_SIZE)
    (data, 0, (__global DATA_TYPE *)offset(&dst, width_ << LOG_VECTOR_SIZE, 0));
    sum1D = ADD_OP(sum1D, data, DATA_TYPE, VECTOR_SIZE);
#endif /* NON_MULTIPLE_OF_VECTOR_SIZE */

    // Perform sum reduction
#if VECTOR_SIZE == 16
    sum1D.s01234567 = ADD_OP(sum1D.s01234567, sum1D.s89ABCDEF, DATA_TYPE, 8);
#endif /* VECTOR SIZE 16 END */
#if VECTOR_SIZE >= 8
    sum1D.s0123 = ADD_OP(sum1D.s0123, sum1D.s4567, DATA_TYPE, 4);
#endif /* VECTOR SIZE 8 END */
#if VECTOR_SIZE >= 4
    sum1D.s01 = ADD_OP(sum1D.s01, sum1D.s23, DATA_TYPE, 2);
#endif /* VECTOR SIZE 4 END */
    sum1D.s0 = ADD_OP(sum1D.s0, sum1D.s1, DATA_TYPE, 1);

    // Calculate and store result
    *((__global DATA_TYPE *)sum.ptr) = sum1D.s0;
}

/** Identifies the maximum value across the 1st dimension and shifts the values of the input tensor by this maximum value,
 * then computes the exponent of each element and sums all elements across each row.
 *
 * @note Datatype must be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
 * @note In case the input is not a multiple of VECTOR_SIZE (2,4,8,16), -DNON_MULTIPLE_OF_VECTOR_SIZE must be passed.
 * @note Beta can be optionally passed at compile time using -DBETA (by default, it is 1.0).
 *
 * @param[in] src_ptr Pointer to the source tensor slice. Supported data types: F16/F32
 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem (in bytes)
 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
 * @param[in] maxo_ptr Pointer to the max values tensor slice. Supported data types: same as @p src_ptr
 * @param[in] maxo_stride_x Stride of the max values tensor in X dimension (in bytes)
 * @param[in] maxo_step_x maxo_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in] maxo_stride_y Stride of the max values tensor in Y dimension (in bytes)
 * @param[in] maxo_step_y maxo_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in] maxo_stride_z Stride of the max values tensor in Z dimension (in bytes)
 * @param[in] maxo_step_z maxo_stride_z * number of elements along Z processed per workitem (in bytes)
 * @param[in] maxo_offset_first_element_in_bytes The offset of the first element in the max values tensor
 * @param[out] dst_ptr Pointer to the destination tensor slice. Supported data types: same as @p src_ptr
 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem (in bytes)
 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
 * @param[out] sum_ptr Pointer to the sum values tensor slice. Supported data types: same as @p src_ptr
 * @param[in] sum_stride_x Stride of the sum values tensor in X dimension (in bytes)
 * @param[in] sum_step_x sum_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in] sum_stride_y Stride of the sum values tensor in Y dimension (in bytes)
 * @param[in] sum_step_y sum_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in] sum_stride_z Stride of the sum values tensor in Z dimension (in bytes)
 * @param[in] sum_step_z sum_stride_z * number of elements along Z processed per workitem (in bytes)
 * @param[in] sum_offset_first_element_in_bytes The offset of the first element in the sum values tensor
 * @param[in] width Input image width
 */
__kernel void softmax_layer_max_shift_exp_sum_parallel(
    TENSOR3D_DECLARATION(src),
    TENSOR3D_DECLARATION(maxo),
    TENSOR3D_DECLARATION(dst),
    TENSOR3D_DECLARATION(sum),
    uint width)
{
    Image src  = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
    Image dst  = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
    Image maxo = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(maxo);
    Image sum  = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(sum);

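    // GRID_SIZE work-items cooperate on a single row: each reduces an interleaved subset of
    // the row and the partial results are combined through local memory further down.
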
    const uint lid = get_local_id(0);

#ifdef BETA
    // Initialize beta
    VEC_DATA_TYPE(DATA_TYPE, 4)
    beta = (VEC_DATA_TYPE(DATA_TYPE, 4))BETA;
#endif /* BETA */

    // Define one temporary vector per work-item.
    __local VEC_DATA_TYPE(DATA_TYPE, 4) tmp_local[GRID_SIZE];
    __local DATA_TYPE max_local;

    __constant VEC_DATA_TYPE(DATA_TYPE, 4) type_min4 = (VEC_DATA_TYPE(DATA_TYPE, 4))(MINVAL);
    VEC_DATA_TYPE(DATA_TYPE, 4)
    max_val_vec = (VEC_DATA_TYPE(DATA_TYPE, 4))type_min4;
    // Number of elements per work-item.
    const uint row = width / GRID_SIZE;
    // Number of iterations per work-item.
    const uint width_ = row >> 2;
    // Calculate max of row
    uint i = 0;
    for(; i < width_; i++)
    {
        VEC_DATA_TYPE(DATA_TYPE, 4)
        data_max = VLOAD(4)(0, (__global DATA_TYPE *)offset(&src, i * GRID_SIZE * 4, 0));
        max_val_vec = MAX_OP(data_max, max_val_vec, DATA_TYPE, 4);
    }
#ifdef NON_MULTIPLE_OF_GRID_SIZE
    // How many work-items needed to complete the computation.
    //TODO: Optimize this calculation (avoid %).
    int boundary_workitems = (width % (GRID_SIZE * 4)) / 4;
    if(lid < boundary_workitems)
    {
        VEC_DATA_TYPE(DATA_TYPE, 4)
        data_max = VLOAD(4)(0, (__global DATA_TYPE *)offset(&src, i * GRID_SIZE * 4, 0));
        max_val_vec = MAX_OP(data_max, max_val_vec, DATA_TYPE, 4);
    }
#ifdef NON_MULTIPLE_OF_VECTOR_SIZE
    if(boundary_workitems == 0)
    {
        boundary_workitems = GRID_SIZE;
        i--;
    }
    if(lid == (boundary_workitems - 1))
    {
        // Handle non multiple of 4
        VEC_DATA_TYPE(DATA_TYPE, 4)
        data_max = VLOAD(4)(0, (__global DATA_TYPE *)offset(&src, (GRID_SIZE * i * 4) + 4, 0));
        VEC_DATA_TYPE(SELECT_DATA_TYPE, 4)
        widx = CONVERT(((uint4)(GRID_SIZE * i * 4) + boundary_workitems * 4 + idx4) < width, VEC_DATA_TYPE(SELECT_DATA_TYPE, 4));
        max_val_vec = MAX_OP(max_val_vec, select(type_min_, data_max, widx), DATA_TYPE, 4);
    }
#endif /* NON_MULTIPLE_OF_VECTOR_SIZE */
#endif /* NON_MULTIPLE_OF_GRID_SIZE */
    tmp_local[lid] = max_val_vec;

    barrier(CLK_LOCAL_MEM_FENCE);

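    // Tree reduction of the per-work-item maxima held in local memory: the number of active
    // work-items halves at every step (GRID_SIZE is assumed to be a power of two).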
    if(GRID_SIZE >= 256)
    {
        if(lid < 128)
        {
            tmp_local[lid] = MAX_OP(tmp_local[lid + 128], tmp_local[lid], DATA_TYPE, 4);
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(GRID_SIZE >= 128)
    {
        if(lid < 64)
        {
            tmp_local[lid] = MAX_OP(tmp_local[lid + 64], tmp_local[lid], DATA_TYPE, 4);
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(GRID_SIZE >= 64)
    {
        if(lid < 32)
        {
            tmp_local[lid] = MAX_OP(tmp_local[lid + 32], tmp_local[lid], DATA_TYPE, 4);
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(GRID_SIZE >= 32)
    {
        if(lid < 16)
        {
            tmp_local[lid] = MAX_OP(tmp_local[lid + 16], tmp_local[lid], DATA_TYPE, 4);
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(GRID_SIZE >= 16)
    {
        if(lid < 8)
        {
            tmp_local[lid] = MAX_OP(tmp_local[lid + 8], tmp_local[lid], DATA_TYPE, 4);
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(GRID_SIZE >= 8)
    {
        if(lid < 4)
        {
            tmp_local[lid] = MAX_OP(tmp_local[lid + 4], tmp_local[lid], DATA_TYPE, 4);
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(GRID_SIZE >= 4)
    {
        if(lid < 2)
        {
            tmp_local[lid] = MAX_OP(tmp_local[lid + 2], tmp_local[lid], DATA_TYPE, 4);
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(lid == 0)
    {
        max_val_vec = MAX_OP(tmp_local[lid + 1], tmp_local[lid], DATA_TYPE, 4);
        max_val_vec.s01 = MAX_OP(max_val_vec.s01, max_val_vec.s23, DATA_TYPE, 2);
        max_val_vec.s0 = MAX_OP(max_val_vec.s0, max_val_vec.s1, DATA_TYPE, 1);
        max_local = max_val_vec.s0;
    }
    barrier(CLK_LOCAL_MEM_FENCE);

    /* Second section */

    // Set sum vector
    VEC_DATA_TYPE(DATA_TYPE, 4)
    sum1D = 0;
    DATA_TYPE max_val = max_local;

    // Shift values, exp and sum
    for(i = 0; i < width_; i++)
    {
        VEC_DATA_TYPE(DATA_TYPE, 4)
        data = VLOAD(4)(0, (__global DATA_TYPE *)offset(&src, i * GRID_SIZE * 4, 0));
        data = SUB_OP(data, max_val, DATA_TYPE, 4);
#ifdef BETA
        data = MUL_OP(data, beta, DATA_TYPE, 4);
#endif /* BETA */
        data = EXP_OP(data, DATA_TYPE, 4);
        VSTORE(4)
        (data, 0, (__global DATA_TYPE *)offset(&dst, i * GRID_SIZE * 4, 0));
        sum1D = ADD_OP(sum1D, data, DATA_TYPE, 4);
    }
#ifdef NON_MULTIPLE_OF_GRID_SIZE
    //TODO: Optimize the calculation (avoid %).
    boundary_workitems = (width % (GRID_SIZE * 4)) / 4;
    if(lid < boundary_workitems)
    {
        VEC_DATA_TYPE(DATA_TYPE, 4)
        data = VLOAD(4)(0, (__global DATA_TYPE *)offset(&src, i * GRID_SIZE * 4, 0));
        data = SUB_OP(data, max_val, DATA_TYPE, 4);
#ifdef BETA
        data = MUL_OP(data, beta, DATA_TYPE, 4);
#endif /* BETA */
        data = EXP_OP(data, DATA_TYPE, 4);
        VSTORE(4)
        (data, 0, (__global DATA_TYPE *)offset(&dst, i * GRID_SIZE * 4, 0));
        sum1D = ADD_OP(sum1D, data, DATA_TYPE, 4);
    }
#ifdef NON_MULTIPLE_OF_VECTOR_SIZE
    if(boundary_workitems == 0)
    {
        boundary_workitems = GRID_SIZE;
        i--;
    }
    if(lid == (boundary_workitems - 1))
    {
        // Handle non multiple of vector size: the offset (GRID_SIZE * i * 4) + 4 moves 4 elements ahead (the * 4 accounts for the vector stride)
        VEC_DATA_TYPE(DATA_TYPE, 4)
        data = VLOAD(4)(0, (__global DATA_TYPE *)offset(&src, (GRID_SIZE * i * 4) + 4, 0));
        data = SUB_OP(data, max_val, DATA_TYPE, 4);
#ifdef BETA
        data = MUL_OP(data, beta, DATA_TYPE, 4);
#endif /* BETA */
        data = EXP_OP(data, DATA_TYPE, 4);
        VEC_DATA_TYPE(SELECT_DATA_TYPE, 4)
        widx = CONVERT(((uint4)(GRID_SIZE * i * 4) + boundary_workitems * 4 + idx4) < width, VEC_DATA_TYPE(SELECT_DATA_TYPE, 4));
        data = select(0, data, widx);
        VSTORE(4)
        (data, 0, (__global DATA_TYPE *)offset(&dst, (GRID_SIZE * i * 4) + 4, 0));
        sum1D = ADD_OP(sum1D, data, DATA_TYPE, 4);
    }
#endif /* NON_MULTIPLE_OF_VECTOR_SIZE */
#endif /* NON_MULTIPLE_OF_GRID_SIZE */
    tmp_local[lid] = sum1D;

    barrier(CLK_LOCAL_MEM_FENCE);

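    // Same local-memory tree reduction as above, this time accumulating the partial sums.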
    if(GRID_SIZE >= 256)
    {
        if(lid < 128)
        {
            tmp_local[lid] = ADD_OP(tmp_local[lid + 128], tmp_local[lid], DATA_TYPE, 4);
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(GRID_SIZE >= 128)
    {
        if(lid < 64)
        {
            tmp_local[lid] = ADD_OP(tmp_local[lid + 64], tmp_local[lid], DATA_TYPE, 4);
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(GRID_SIZE >= 64)
    {
        if(lid < 32)
        {
            tmp_local[lid] = ADD_OP(tmp_local[lid + 32], tmp_local[lid], DATA_TYPE, 4);
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(GRID_SIZE >= 32)
    {
        if(lid < 16)
        {
            tmp_local[lid] = ADD_OP(tmp_local[lid + 16], tmp_local[lid], DATA_TYPE, 4);
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(GRID_SIZE >= 16)
    {
        if(lid < 8)
        {
            tmp_local[lid] = ADD_OP(tmp_local[lid + 8], tmp_local[lid], DATA_TYPE, 4);
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(GRID_SIZE >= 8)
    {
        if(lid < 4)
        {
            tmp_local[lid] = ADD_OP(tmp_local[lid + 4], tmp_local[lid], DATA_TYPE, 4);
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(GRID_SIZE >= 4)
    {
        if(lid < 2)
        {
            tmp_local[lid] = ADD_OP(tmp_local[lid + 2], tmp_local[lid], DATA_TYPE, 4);
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(lid == 0)
    {
        sum1D = ADD_OP(tmp_local[lid + 1], tmp_local[lid], DATA_TYPE, 4);
        // Perform sum reduction
        sum1D.s01 = ADD_OP(sum1D.s01, sum1D.s23, DATA_TYPE, 2);
        sum1D.s0 = ADD_OP(sum1D.s0, sum1D.s1, DATA_TYPE, 1);
        *((__global DATA_TYPE *)sum.ptr) = sum1D.s0;
    }
}