blob: 5bc43ef1442a6cfe108f1ea1b662901191cc9757 [file] [log] [blame]
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001/*
2 * Copyright (c) 2017 ARM Limited.
3 *
4 * SPDX-License-Identifier: MIT
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to
8 * deal in the Software without restriction, including without limitation the
9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10 * sell copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in all
14 * copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24#include "helpers.h"
25
Anthony Barbierac69aa12017-07-03 17:39:37 +010026#ifdef FIXED_POINT_POSITION
Georgios Pinitase5f8fd62017-06-23 18:03:44 +010027
28#include "fixed_point.h"
29#define MAX_OP(x, y, type, size) MAX_OP_EXPAND(x, y, type, size)
30#define ADD_OP(x, y, type, size) ADD_SAT_OP_EXPAND((x), (y), type, size)
31#define SUB_OP(x, y, type, size) SUB_SAT_OP_EXPAND((x), (y), type, size)
Pablo Palmier48a60f92017-10-18 11:03:08 +010032#define MUL_OP(x, y, type, size) MUL_SAT_OP_EXPAND((x), (y), type, size, FIXED_POINT_POSITION)
steniu010c7614f2017-06-23 17:00:26 +010033#define DIV_OP(x, y, type, size) DIV_SAT_OP_VEC_EXPAND((x), (y), type, size, FIXED_POINT_POSITION)
Georgios Pinitase5f8fd62017-06-23 18:03:44 +010034#define EXP_OP(x, type, size) EXP_OP_EXPAND((x), type, size, FIXED_POINT_POSITION)
35
36#define MIN_VAL_EXPAND(type) type##_MIN
37#define MIN_VAL(type) MIN_VAL_EXPAND(type)
38#define MINVAL MIN_VAL(DATA_TYPE)
39#define SELECT_DATA_TYPE EXPAND(DATA_TYPE)
40
Anthony Barbierac69aa12017-07-03 17:39:37 +010041#else /* FIXED_POINT_POSITION */
Georgios Pinitase5f8fd62017-06-23 18:03:44 +010042
43#define MAX_OP(x, y, type, size) max((x), (y))
44#define ADD_OP(x, y, type, size) ((x) + (y))
45#define SUB_OP(x, y, type, size) ((x) - (y))
Pablo Palmier48a60f92017-10-18 11:03:08 +010046#define MUL_OP(x, y, type, size) ((x) * (y))
Georgios Pinitase5f8fd62017-06-23 18:03:44 +010047#define DIV_OP(x, y, type, size) ((x) / (y))
48#define EXP_OP(x, type, size) exp((x))
49
Anthony Barbierac69aa12017-07-03 17:39:37 +010050#ifdef USE_F16
Georgios Pinitase5f8fd62017-06-23 18:03:44 +010051#define MINVAL -HALF_MAX
52#define SELECT_DATA_TYPE short
Anthony Barbierac69aa12017-07-03 17:39:37 +010053#else /* USE_F16 */
Georgios Pinitase5f8fd62017-06-23 18:03:44 +010054#define MINVAL -FLT_MAX
Anthony Barbier6ff3b192017-09-04 18:44:23 +010055#define SELECT_DATA_TYPE int
Anthony Barbierac69aa12017-07-03 17:39:37 +010056#endif /* USE_F16 */
Georgios Pinitase5f8fd62017-06-23 18:03:44 +010057
Anthony Barbierac69aa12017-07-03 17:39:37 +010058#endif /* FIXED_POINT_POSITION */
Anthony Barbier6ff3b192017-09-04 18:44:23 +010059
Chunosovd6afedc2017-11-06 22:09:45 +070060/* Number of workitems in dimension 0. */
61#if !defined(GRID_SIZE)
62#define GRID_SIZE 1
63#endif /* !defined(GRID_SIZE) */
64
65/* Vector size, i.e. number of vector elements. */
66#if VECTOR_SIZE == 2
67__constant VEC_DATA_TYPE(DATA_TYPE, 2) type_min_ = (VEC_DATA_TYPE(DATA_TYPE, 2))(MINVAL);
68__constant uint2 idx__ = (uint2)(0, 1);
69
70#elif VECTOR_SIZE == 4
71__constant VEC_DATA_TYPE(DATA_TYPE, 4) type_min_ = (VEC_DATA_TYPE(DATA_TYPE, 4))(MINVAL);
72__constant uint4 idx__ = (uint4)(0, 1, 2, 3);
73
74#elif VECTOR_SIZE == 8
75__constant VEC_DATA_TYPE(DATA_TYPE, 8) type_min_ = (VEC_DATA_TYPE(DATA_TYPE, 8))(MINVAL);
76__constant uint8 idx__ = (uint8)(0, 1, 2, 3, 4, 5, 6, 7);
77
78#else /* VECTOR_SIZE DEFAULT */
79#define VECTOR_SIZE 16
80#define LOG_VECTOR_SIZE 4
81__constant VEC_DATA_TYPE(DATA_TYPE, 16) type_min_ = (VEC_DATA_TYPE(DATA_TYPE, 16))(MINVAL);
82__constant uint16 idx__ = (uint16)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
83
84#endif /* VECTOR_SIZE END */
85
86// TODO (COMPMID-661): Remove if the non-fused kernels are removed
Anthony Barbier6ff3b192017-09-04 18:44:23 +010087__constant VEC_DATA_TYPE(DATA_TYPE, 16) type_min = (VEC_DATA_TYPE(DATA_TYPE, 16))(MINVAL);
88__constant uint16 idx16 = (uint16)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
Chunosovd6afedc2017-11-06 22:09:45 +070089__constant uint4 idx4 = (uint4)(0, 1, 2, 3);
Anthony Barbier6ff3b192017-09-04 18:44:23 +010090
91/** Identifies the maximum value across the 1st dimension.
92 *
93 * @note Datatype must be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
Georgios Pinitase5f8fd62017-06-23 18:03:44 +010094 * @note Fixed point position must be given as a preprocessor argument using -DFIXED_POINT_POSITION=pos. e.g. DFIXED_POINT_POSITION=4
Anthony Barbier6ff3b192017-09-04 18:44:23 +010095 * @note In case the input is not multiple of 16 -DNON_MULTIPLE_OF_16 must be passed.
96 *
Georgios Pinitas09796752017-07-10 16:05:21 +010097 * @param[in] src_ptr Pointer to the source tensor slice. Supported data types: QS8/QS16/F16/F32
Anthony Barbier6ff3b192017-09-04 18:44:23 +010098 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
99 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
100 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
101 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
steniu010d523cc2017-07-13 14:24:23 +0100102 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
103 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100104 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
Georgios Pinitase5f8fd62017-06-23 18:03:44 +0100105 * @param[out] dst_ptr Pointer to the destination tensor slice. Supported data types: same as @p src_ptr
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100106 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
107 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
108 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
109 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
steniu010d523cc2017-07-13 14:24:23 +0100110 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
111 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100112 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
113 * @param[in] width Input image width
114 */
115__kernel void softmax_layer_max(
steniu010d523cc2017-07-13 14:24:23 +0100116 TENSOR3D_DECLARATION(src),
117 TENSOR3D_DECLARATION(dst),
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100118 uint width)
119{
steniu010d523cc2017-07-13 14:24:23 +0100120 Image src = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
121 Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100122
123 // Initialize local maximum
124 VEC_DATA_TYPE(DATA_TYPE, 16)
125 max_val = (VEC_DATA_TYPE(DATA_TYPE, 16))type_min;
126
127 // Calculate max of row
128 const uint width4 = width >> 4;
129 for(uint i = 0; i < width4; i++)
130 {
131 VEC_DATA_TYPE(DATA_TYPE, 16)
132 data = vload16(0, (__global DATA_TYPE *)offset(&src, i << 4, 0));
Georgios Pinitase5f8fd62017-06-23 18:03:44 +0100133 max_val = MAX_OP(data, max_val, DATA_TYPE, 16);
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100134 }
135
Anthony Barbierac69aa12017-07-03 17:39:37 +0100136#ifdef NON_MULTIPLE_OF_16
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100137 // Handle non multiple of 16
138 VEC_DATA_TYPE(DATA_TYPE, 16)
139 data = vload16(0, (__global DATA_TYPE *)offset(&src, width4 << 4, 0));
140 VEC_DATA_TYPE(SELECT_DATA_TYPE, 16)
141 widx = CONVERT(((uint16)(width4 << 4) + idx16) < width, VEC_DATA_TYPE(SELECT_DATA_TYPE, 16));
Georgios Pinitase5f8fd62017-06-23 18:03:44 +0100142 max_val = MAX_OP(max_val, select(type_min, data, widx), DATA_TYPE, 16);
Anthony Barbierac69aa12017-07-03 17:39:37 +0100143#endif /* NON_MULTIPLE_OF_16 */
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100144
145 // Perform max reduction
Georgios Pinitase5f8fd62017-06-23 18:03:44 +0100146 max_val.s01234567 = MAX_OP(max_val.s01234567, max_val.s89ABCDEF, DATA_TYPE, 8);
147 max_val.s0123 = MAX_OP(max_val.s0123, max_val.s4567, DATA_TYPE, 4);
148 max_val.s01 = MAX_OP(max_val.s01, max_val.s23, DATA_TYPE, 2);
149 max_val.s0 = MAX_OP(max_val.s0, max_val.s1, DATA_TYPE, 1);
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100150
151 // Store result
152 *((__global DATA_TYPE *)dst.ptr) = max_val.s0;
153}
154
155/** Shifts the values of the input tensor by the max calculated in softmax_layer_max kernel,
156 * then gets the exponent of each element as sums all elements across each row.
157 *
158 * @note Datatype must be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
Georgios Pinitase5f8fd62017-06-23 18:03:44 +0100159 * @note Fixed point position must be given as a preprocessor argument using -DFIXED_POINT_POSITION=pos. e.g. DFIXED_POINT_POSITION=4
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100160 * @note In case the input is not multiple of 16 -DNON_MULTIPLE_OF_16 must be passed.
Pablo Palmier48a60f92017-10-18 11:03:08 +0100161 * @note Beta can be optionally passed at compile time using -DBETA (if undefined, assume it equals 1.0)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100162 *
Georgios Pinitas09796752017-07-10 16:05:21 +0100163 * @param[in] src_ptr Pointer to the source tensor slice. Supported data types: QS8/QS16/F16/F32
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100164 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
165 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
166 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
167 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
steniu010d523cc2017-07-13 14:24:23 +0100168 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
169 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100170 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
Georgios Pinitase5f8fd62017-06-23 18:03:44 +0100171 * @param[in] max_ptr Pointer to the max values tensor slice. Supported data types: same as @p src_ptr
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100172 * @param[in] max_stride_x Stride of the max values tensor in X dimension (in bytes)
173 * @param[in] max_step_x max_stride_x * number of elements along X processed per workitem(in bytes)
174 * @param[in] max_stride_y Stride of the max values tensor in Y dimension (in bytes)
175 * @param[in] max_step_y max_stride_y * number of elements along Y processed per workitem(in bytes)
steniu010d523cc2017-07-13 14:24:23 +0100176 * @param[in] max_stride_z Stride of the max values tensor in Z dimension (in bytes)
177 * @param[in] max_step_z max_stride_z * number of elements along Z processed per workitem(in bytes)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100178 * @param[in] max_offset_first_element_in_bytes The offset of the first element in the max values tensor
Georgios Pinitase5f8fd62017-06-23 18:03:44 +0100179 * @param[out] dst_ptr Pointer to the destination tensor slice. Supported data types: same as @p src_ptr
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100180 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
181 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
182 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
183 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
steniu010d523cc2017-07-13 14:24:23 +0100184 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
185 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100186 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
Georgios Pinitase5f8fd62017-06-23 18:03:44 +0100187 * @param[out] sum_ptr Pointer to the sum values tensor slice. Supported data types: same as @p src_ptr
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100188 * @param[in] sum_stride_x Stride of the sum values tensor in X dimension (in bytes)
189 * @param[in] sum_step_x sum_stride_x * number of elements along X processed per workitem(in bytes)
190 * @param[in] sum_stride_y Stride of the sum values tensor in Y dimension (in bytes)
steniu010d523cc2017-07-13 14:24:23 +0100191 * @param[in] sum_step_y sum_stride_z * number of elements along Z processed per workitem(in bytes)
192 * @param[in] sum_stride_z Stride of the sum values tensor in Z dimension (in bytes)
193 * @param[in] sum_step_z sum_stride_z * number of elements along Z processed per workitem(in bytes)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100194 * @param[in] sum_offset_first_element_in_bytes The offset of the first element in the sum values tensor
195 * @param[in] width Input image width
196 */
197__kernel void softmax_layer_shift_exp_sum(
steniu010d523cc2017-07-13 14:24:23 +0100198 TENSOR3D_DECLARATION(src),
199 TENSOR3D_DECLARATION(max),
200 TENSOR3D_DECLARATION(dst),
201 TENSOR3D_DECLARATION(sum),
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100202 uint width)
203{
steniu010d523cc2017-07-13 14:24:23 +0100204 Image src = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
205 Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
206 Image max = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(max);
207 Image sum = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(sum);
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100208
Pablo Palmier48a60f92017-10-18 11:03:08 +0100209#ifdef BETA
210 // Initialize beta
211 VEC_DATA_TYPE(DATA_TYPE, 16)
212 beta = (VEC_DATA_TYPE(DATA_TYPE, 16))BETA;
213#endif /* BETA */
214
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100215 // Load max value of 1D logits vector (row)
216 DATA_TYPE max_val = *((__global DATA_TYPE *)offset(&max, 0, 0));
217
218 // Set sum vector
219 VEC_DATA_TYPE(DATA_TYPE, 16)
220 sum1D = 0;
221
222 // Shift values, exp and sum
223 const uint width4 = width >> 4;
224 for(uint i = 0; i < width4; i++)
225 {
226 VEC_DATA_TYPE(DATA_TYPE, 16)
227 data = vload16(0, (__global DATA_TYPE *)offset(&src, i << 4, 0));
Georgios Pinitase5f8fd62017-06-23 18:03:44 +0100228 data = SUB_OP(data, max_val, DATA_TYPE, 16);
Pablo Palmier48a60f92017-10-18 11:03:08 +0100229#ifdef BETA
230 data = MUL_OP(data, beta, DATA_TYPE, 16);
231#endif /* BETA */
Georgios Pinitase5f8fd62017-06-23 18:03:44 +0100232 data = EXP_OP(data, DATA_TYPE, 16);
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100233 vstore16(data, 0, (__global DATA_TYPE *)offset(&dst, i << 4, 0));
Georgios Pinitase5f8fd62017-06-23 18:03:44 +0100234 sum1D = ADD_OP(sum1D, data, DATA_TYPE, 16);
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100235 }
236
Anthony Barbierac69aa12017-07-03 17:39:37 +0100237#ifdef NON_MULTIPLE_OF_16
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100238 // Handle non multiple of 16
239 VEC_DATA_TYPE(DATA_TYPE, 16)
240 data = vload16(0, (__global DATA_TYPE *)offset(&src, width4 << 4, 0));
Georgios Pinitase5f8fd62017-06-23 18:03:44 +0100241 data = SUB_OP(data, max_val, DATA_TYPE, 16);
Pablo Palmier48a60f92017-10-18 11:03:08 +0100242#ifdef BETA
243 data = MUL_OP(data, beta, DATA_TYPE, 16);
244#endif /* BETA */
Georgios Pinitase5f8fd62017-06-23 18:03:44 +0100245 data = EXP_OP(data, DATA_TYPE, 16);
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100246 VEC_DATA_TYPE(SELECT_DATA_TYPE, 16)
247 widx = CONVERT(((uint16)(width4 << 4) + idx16) < width, VEC_DATA_TYPE(SELECT_DATA_TYPE, 16));
248 data = select(0, data, widx);
249 vstore16(data, 0, (__global DATA_TYPE *)offset(&dst, width4 << 4, 0));
Georgios Pinitase5f8fd62017-06-23 18:03:44 +0100250 sum1D = ADD_OP(sum1D, data, DATA_TYPE, 16);
Anthony Barbierac69aa12017-07-03 17:39:37 +0100251#endif /* NON_MULTIPLE_OF_16 */
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100252
253 // Perform min/max reduction
Georgios Pinitase5f8fd62017-06-23 18:03:44 +0100254 sum1D.s01234567 = ADD_OP(sum1D.s01234567, sum1D.s89ABCDEF, DATA_TYPE, 8);
255 sum1D.s0123 = ADD_OP(sum1D.s0123, sum1D.s4567, DATA_TYPE, 4);
256 sum1D.s01 = ADD_OP(sum1D.s01, sum1D.s23, DATA_TYPE, 2);
257 sum1D.s0 = ADD_OP(sum1D.s0, sum1D.s1, DATA_TYPE, 1);
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100258
259 // Calculate and store result
260 *((__global DATA_TYPE *)sum.ptr) = sum1D.s0;
261}
262
263/** Divides all the values of the input tensor by the sum calculated from softmax_layer_shift_exp_sum kernel.
264 *
265 * @note Datatype must be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
Georgios Pinitase5f8fd62017-06-23 18:03:44 +0100266 * @note Fixed point position must be given as a preprocessor argument using -DFIXED_POINT_POSITION=pos. e.g. DFIXED_POINT_POSITION=4
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100267 *
Georgios Pinitas09796752017-07-10 16:05:21 +0100268 * @param[in] src_ptr Pointer to the source tensor slice. Supported data types: QS8/QS16/F16/F32
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100269 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
270 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
271 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
272 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
steniu010d523cc2017-07-13 14:24:23 +0100273 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
274 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100275 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
Georgios Pinitase5f8fd62017-06-23 18:03:44 +0100276 * @param[in] sum_ptr Pointer to the sum values tensor slice. Supported data types: same as @p src_ptr
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100277 * @param[in] sum_stride_x Stride of the sum values tensor in X dimension (in bytes)
278 * @param[in] sum_step_x sum_stride_x * number of elements along X processed per workitem(in bytes)
279 * @param[in] sum_stride_y Stride of the sum values tensor in Y dimension (in bytes)
280 * @param[in] sum_step_y sum_stride_y * number of elements along Y processed per workitem(in bytes)
steniu010d523cc2017-07-13 14:24:23 +0100281 * @param[in] sum_stride_z Stride of the sum values tensor in Z dimension (in bytes)
282 * @param[in] sum_step_z sum_stride_z * number of elements along Z processed per workitem(in bytes)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100283 * @param[in] sum_offset_first_element_in_bytes The offset of the first element in the sum values tensor
Georgios Pinitase5f8fd62017-06-23 18:03:44 +0100284 * @param[out] dst_ptr Pointer to the destination tensor slice. Supported data types: same as @p src_ptr
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100285 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
286 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
287 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
288 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
steniu010d523cc2017-07-13 14:24:23 +0100289 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
290 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100291 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
292 */
293__kernel void softmax_layer_norm(
steniu010d523cc2017-07-13 14:24:23 +0100294 TENSOR3D_DECLARATION(src),
295 TENSOR3D_DECLARATION(sum),
296 TENSOR3D_DECLARATION(dst))
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100297{
steniu010d523cc2017-07-13 14:24:23 +0100298 Image src = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
299 Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
300 Image sum = CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(sum);
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100301
302 // Load max value of 1D logits vector (row)
303 DATA_TYPE sum_val = *((__global DATA_TYPE *)offset(&sum, 0, get_global_id(1)));
304 VEC_DATA_TYPE(DATA_TYPE, 16)
305 data = vload16(0, (__global DATA_TYPE *)offset(&src, 0, 0));
Georgios Pinitase5f8fd62017-06-23 18:03:44 +0100306 vstore16(DIV_OP(data, sum_val, DATA_TYPE, 16), 0, (__global DATA_TYPE *)offset(&dst, 0, 0));
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100307}
Chunosovd6afedc2017-11-06 22:09:45 +0700308
309/** Identifies the maximum value across the 1st dimension and shifts the values of the input tensor by this maximum value,
310 * then gets the exponent of each element as sums all elements across each row.
311 *
312 * @note Datatype must be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
313 * @note Fixed point position must be given as a preprocessor argument using -DFIXED_POINT_POSITION=pos. e.g. DFIXED_POINT_POSITION=4
314 * @note In case the input is not a multiple of VECTOR_SIZE (2,4,8,16) -DNON_MULTIPLE_OF_VECTOR_SIZE must be passed.
315 * @note Beta can be optionally passed at compile time using -DBETA (by default, it is 1.0).
316 *
317 * @param[in] src_ptr Pointer to the source tensor slice. Supported data types: QS8/QS16/F16/F32
318 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
319 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
320 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
321 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
322 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
323 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
324 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
325 * @param[in] maxo_ptr Pointer to the max values tensor slice. Supported data types: same as @p src_ptr
326 * @param[in] maxo_stride_x Stride of the max values tensor in X dimension (in bytes)
327 * @param[in] maxo_step_x max_stride_x * number of elements along X processed per workitem(in bytes)
328 * @param[in] maxo_stride_y Stride of the max values tensor in Y dimension (in bytes)
329 * @param[in] maxo_step_y max_stride_y * number of elements along Y processed per workitem(in bytes)
330 * @param[in] maxo_stride_z Stride of the max values tensor in Z dimension (in bytes)
331 * @param[in] maxo_step_z max_stride_z * number of elements along Z processed per workitem(in bytes)
332 * @param[in] maxo_offset_first_element_in_bytes The offset of the first element in the max values tensor
333 * @param[out] dst_ptr Pointer to the destination tensor slice. Supported data types: same as @p src_ptr
334 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
335 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
336 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
337 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
338 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
339 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
340 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
341 * @param[out] sum_ptr Pointer to the sum values tensor slice. Supported data types: same as @p src_ptr
342 * @param[in] sum_stride_x Stride of the sum values tensor in X dimension (in bytes)
343 * @param[in] sum_step_x sum_stride_x * number of elements along X processed per workitem(in bytes)
344 * @param[in] sum_stride_y Stride of the sum values tensor in Y dimension (in bytes)
345 * @param[in] sum_step_y sum_stride_z * number of elements along Z processed per workitem(in bytes)
346 * @param[in] sum_stride_z Stride of the sum values tensor in Z dimension (in bytes)
347 * @param[in] sum_step_z sum_stride_z * number of elements along Z processed per workitem(in bytes)
348 * @param[in] sum_offset_first_element_in_bytes The offset of the first element in the sum values tensor
349 * @param[in] width Input image width
350 */
351__kernel void softmax_layer_max_shift_exp_sum_serial(
352 TENSOR3D_DECLARATION(src),
353 TENSOR3D_DECLARATION(maxo),
354 TENSOR3D_DECLARATION(dst),
355 TENSOR3D_DECLARATION(sum),
356 uint width)
357{
358 Image src = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
359 Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
360 Image maxo = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(maxo);
361 Image sum = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(sum);
362
363#ifdef BETA
364 // Initialize beta
365 VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)
366 beta = (VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE))BETA_VAL;
367#endif /* BETA */
368
369 // Initialize local maximum
370 VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)
371 max_val_vec = (VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE))type_min_;
372
373 // Calculate max of row
374 const uint width_ = width >> LOG_VECTOR_SIZE;
375 for(uint i = 0; i < width_; i++)
376 {
377 VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)
378 data_max = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)offset(&src, i << LOG_VECTOR_SIZE, 0));
379 max_val_vec = MAX_OP(data_max, max_val_vec, DATA_TYPE, VECTOR_SIZE);
380 }
381
382#ifdef NON_MULTIPLE_OF_VECTOR_SIZE
383 VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)
384 data_max = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)offset(&src, width_ << LOG_VECTOR_SIZE, 0));
385 VEC_DATA_TYPE(SELECT_DATA_TYPE, VECTOR_SIZE)
386 widx = CONVERT((EXPAND((CL_VEC_DATA_TYPE(uint, VECTOR_SIZE)))(width_ << LOG_VECTOR_SIZE) + idx__) < width, VEC_DATA_TYPE(SELECT_DATA_TYPE, VECTOR_SIZE));
387 max_val_vec = MAX_OP(max_val_vec, select(type_min_, data_max, widx), DATA_TYPE, VECTOR_SIZE);
388#endif /* NON_MULTIPLE_OF_VECTOR_SIZE */
389
390 // Perform max reduction
391#if VECTOR_SIZE == 16
392 max_val_vec.s01234567 = MAX_OP(max_val_vec.s01234567, max_val_vec.s89ABCDEF, DATA_TYPE, 8);
393#endif /* VECTOR SIZE 16 END */
394#if VECTOR_SIZE >= 8
395 max_val_vec.s0123 = MAX_OP(max_val_vec.s0123, max_val_vec.s4567, DATA_TYPE, 4);
396#endif /* VECTOR SIZE 8 END */
397#if VECTOR_SIZE >= 4
398 max_val_vec.s01 = MAX_OP(max_val_vec.s01, max_val_vec.s23, DATA_TYPE, 2);
399#endif /* VECTOR SIZE 4 END */
400 max_val_vec.s0 = MAX_OP(max_val_vec.s0, max_val_vec.s1, DATA_TYPE, 1);
401 // Store result
402 *((__global DATA_TYPE *)maxo.ptr) = max_val_vec.s0;
403
404 /* Second section */
405
406 // Load max value of 1D logits vector (row)
407 DATA_TYPE max_val = *((__global DATA_TYPE *)offset(&maxo, 0, 0));
408
409 // Set sum vector
410 VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)
411 sum1D = 0;
412
413 // Shift values, exp and sum
414 for(uint i = 0; i < width_; i++)
415 {
416 VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)
417 data = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)offset(&src, i << LOG_VECTOR_SIZE, 0));
418 data = SUB_OP(data, max_val, DATA_TYPE, VECTOR_SIZE);
419#ifdef BETA
420 data = MUL_OP(data, beta, DATA_TYPE, VECTOR_SIZE);
421#endif /* BETA */
422 data = EXP_OP(data, DATA_TYPE, VECTOR_SIZE);
423 VSTORE(VECTOR_SIZE)
424 (data, 0, (__global DATA_TYPE *)offset(&dst, i << LOG_VECTOR_SIZE, 0));
425 sum1D = ADD_OP(sum1D, data, DATA_TYPE, VECTOR_SIZE);
426 }
427
428#ifdef NON_MULTIPLE_OF_VECTOR_SIZE
429 VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)
430 data = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)offset(&src, width_ << LOG_VECTOR_SIZE, 0));
431 data = SUB_OP(data, max_val, DATA_TYPE, VECTOR_SIZE);
432#ifdef BETA
433 data = MUL_OP(data, beta, DATA_TYPE, VECTOR_SIZE);
434#endif /* BETA */
435 data = EXP_OP(data, DATA_TYPE, VECTOR_SIZE);
436 widx = CONVERT((EXPAND((CL_VEC_DATA_TYPE(uint, VECTOR_SIZE)))(width_ << LOG_VECTOR_SIZE) + idx__) < width, VEC_DATA_TYPE(SELECT_DATA_TYPE, VECTOR_SIZE));
437 data = select(0, data, widx);
438 VSTORE(VECTOR_SIZE)
439 (data, 0, (__global DATA_TYPE *)offset(&dst, width_ << LOG_VECTOR_SIZE, 0));
440 sum1D = ADD_OP(sum1D, data, DATA_TYPE, VECTOR_SIZE);
441#endif /* NON_MULTIPLE_OF_VECTOR_SIZE */
442
443 // Perform sum reduction
444#if VECTOR_SIZE == 16
445 sum1D.s01234567 = ADD_OP(sum1D.s01234567, sum1D.s89ABCDEF, DATA_TYPE, 8);
446#endif /* VECTOR SIZE 16 END */
447#if VECTOR_SIZE >= 8
448 sum1D.s0123 = ADD_OP(sum1D.s0123, sum1D.s4567, DATA_TYPE, 4);
449#endif /* VECTOR SIZE 8 END */
450#if VECTOR_SIZE >= 4
451 sum1D.s01 = ADD_OP(sum1D.s01, sum1D.s23, DATA_TYPE, 2);
452#endif /* VECTOR SIZE 4 END */
453 sum1D.s0 = ADD_OP(sum1D.s0, sum1D.s1, DATA_TYPE, 1);
454
455 // Calculate and store result
456 *((__global DATA_TYPE *)sum.ptr) = sum1D.s0;
457}
458
459/** Identifies the maximum value across the 1st dimension and shifts the values of the input tensor by this maximum value,
460 * then gets the exponent of each element as sums all elements across each row.
461 *
462 * @note Datatype must be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
463 * @note Fixed point position must be given as a preprocessor argument using -DFIXED_POINT_POSITION=pos. e.g. DFIXED_POINT_POSITION=4
464 * @note In case the input is not a multiple of VECTOR_SIZE (2,4,8,16) -DNON_MULTIPLE_OF_VECTOR_SIZE must be passed.
465 * @note Beta can be optionally passed at compile time using -DBETA (by default, it is 1.0).
466 *
467 * @param[in] src_ptr Pointer to the source tensor slice. Supported data types: QS8/QS16/F16/F32
468 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
469 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
470 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
471 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
472 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
473 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
474 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
475 * @param[in] maxo_ptr Pointer to the max values tensor slice. Supported data types: same as @p src_ptr
476 * @param[in] maxo_stride_x Stride of the max values tensor in X dimension (in bytes)
477 * @param[in] maxo_step_x max_stride_x * number of elements along X processed per workitem(in bytes)
478 * @param[in] maxo_stride_y Stride of the max values tensor in Y dimension (in bytes)
479 * @param[in] maxo_step_y max_stride_y * number of elements along Y processed per workitem(in bytes)
480 * @param[in] maxo_stride_z Stride of the max values tensor in Z dimension (in bytes)
481 * @param[in] maxo_step_z max_stride_z * number of elements along Z processed per workitem(in bytes)
482 * @param[in] maxo_offset_first_element_in_bytes The offset of the first element in the max values tensor
483 * @param[out] dst_ptr Pointer to the destination tensor slice. Supported data types: same as @p src_ptr
484 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
485 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
486 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
487 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
488 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
489 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
490 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
491 * @param[out] sum_ptr Pointer to the sum values tensor slice. Supported data types: same as @p src_ptr
492 * @param[in] sum_stride_x Stride of the sum values tensor in X dimension (in bytes)
493 * @param[in] sum_step_x sum_stride_x * number of elements along X processed per workitem(in bytes)
494 * @param[in] sum_stride_y Stride of the sum values tensor in Y dimension (in bytes)
495 * @param[in] sum_step_y sum_stride_z * number of elements along Z processed per workitem(in bytes)
496 * @param[in] sum_stride_z Stride of the sum values tensor in Z dimension (in bytes)
497 * @param[in] sum_step_z sum_stride_z * number of elements along Z processed per workitem(in bytes)
498 * @param[in] sum_offset_first_element_in_bytes The offset of the first element in the sum values tensor
499 * @param[in] width Input image width
500 */
501__kernel void softmax_layer_max_shift_exp_sum_parallel(
502 TENSOR3D_DECLARATION(src),
503 TENSOR3D_DECLARATION(maxo),
504 TENSOR3D_DECLARATION(dst),
505 TENSOR3D_DECLARATION(sum),
506 uint width)
507{
508 Image src = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
509 Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
510 Image maxo = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(maxo);
511 Image sum = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(sum);
512
513 const uint lid = get_local_id(0);
514
515#ifdef BETA
516 // Initialize beta
517 VEC_DATA_TYPE(DATA_TYPE, 4)
518 beta = (VEC_DATA_TYPE(DATA_TYPE, 4))BETA;
519#endif /* BETA */
520
521 // Define one temporary vector per work-item.
522 __local VEC_DATA_TYPE(DATA_TYPE, 4) tmp_local[GRID_SIZE];
523 __local DATA_TYPE max_local;
524
525 __constant VEC_DATA_TYPE(DATA_TYPE, 4) type_min4 = (VEC_DATA_TYPE(DATA_TYPE, 4))(MINVAL);
526 VEC_DATA_TYPE(DATA_TYPE, 4)
527 max_val_vec = (VEC_DATA_TYPE(DATA_TYPE, 4))type_min4;
528 // Number of elements per work-item.
529 const uint row = width / GRID_SIZE;
530 // Number of iterations per work-item.
531 const uint width_ = row >> 2;
532 // Calculate max of row
533 uint i = 0;
534 for(; i < width_; i++)
535 {
536 VEC_DATA_TYPE(DATA_TYPE, 4)
537 data_max = VLOAD(4)(0, (__global DATA_TYPE *)offset(&src, i * GRID_SIZE * 4, 0));
538 max_val_vec = MAX_OP(data_max, max_val_vec, DATA_TYPE, 4);
539 }
540#ifdef NON_MULTIPLE_OF_GRID_SIZE
541 // How many work-items needed to complete the computation.
542 //TODO: Optimize this calculation (avoid %).
543 int boundary_workitems = (width % (GRID_SIZE * 4)) / 4;
544 if(lid < boundary_workitems)
545 {
546 VEC_DATA_TYPE(DATA_TYPE, 4)
547 data_max = VLOAD(4)(0, (__global DATA_TYPE *)offset(&src, i * GRID_SIZE * 4, 0));
548 max_val_vec = MAX_OP(data_max, max_val_vec, DATA_TYPE, 4);
549 }
550#ifdef NON_MULTIPLE_OF_VECTOR_SIZE
551 if(boundary_workitems == 0)
552 {
553 boundary_workitems = GRID_SIZE;
554 i--;
555 }
556 if(lid == (boundary_workitems - 1))
557 {
558 // Handle non multiple of 4
559 VEC_DATA_TYPE(DATA_TYPE, 4)
560 data_max = VLOAD(4)(0, (__global DATA_TYPE *)offset(&src, (GRID_SIZE * i * 4) + 4, 0));
561 VEC_DATA_TYPE(SELECT_DATA_TYPE, 4)
562 widx = CONVERT(((uint4)(GRID_SIZE * i * 4) + boundary_workitems * 4 + idx4) < width, VEC_DATA_TYPE(SELECT_DATA_TYPE, 4));
563 max_val_vec = MAX_OP(max_val_vec, select(type_min_, data_max, widx), DATA_TYPE, 4);
564 }
565#endif /* NON_MULTIPLE_OF_VECTOR_SIZE */
566#endif /* NON_MULTIPLE_OF_GRID_SIZE */
567 tmp_local[lid] = max_val_vec;
568
569 barrier(CLK_LOCAL_MEM_FENCE);
570
571 if(GRID_SIZE >= 256)
572 {
573 if(lid < 128)
574 {
575 tmp_local[lid] = MAX_OP(tmp_local[lid + 128], tmp_local[lid], DATA_TYPE, 4);
576 }
577 barrier(CLK_LOCAL_MEM_FENCE);
578 }
579 if(GRID_SIZE >= 128)
580 {
581 if(lid < 64)
582 {
583 tmp_local[lid] = MAX_OP(tmp_local[lid + 64], tmp_local[lid], DATA_TYPE, 4);
584 }
585 barrier(CLK_LOCAL_MEM_FENCE);
586 }
587 if(GRID_SIZE >= 64)
588 {
589 if(lid < 32)
590 {
591 tmp_local[lid] = MAX_OP(tmp_local[lid + 32], tmp_local[lid], DATA_TYPE, 4);
592 }
593 barrier(CLK_LOCAL_MEM_FENCE);
594 }
595 if(GRID_SIZE >= 32)
596 {
597 if(lid < 16)
598 {
599 tmp_local[lid] = MAX_OP(tmp_local[lid + 16], tmp_local[lid], DATA_TYPE, 4);
600 }
601 barrier(CLK_LOCAL_MEM_FENCE);
602 }
603 if(GRID_SIZE >= 16)
604 {
605 if(lid < 8)
606 {
607 tmp_local[lid] = MAX_OP(tmp_local[lid + 8], tmp_local[lid], DATA_TYPE, 4);
608 }
609 barrier(CLK_LOCAL_MEM_FENCE);
610 }
611 if(GRID_SIZE >= 8)
612 {
613 if(lid < 4)
614 {
615 tmp_local[lid] = MAX_OP(tmp_local[lid + 4], tmp_local[lid], DATA_TYPE, 4);
616 }
617 barrier(CLK_LOCAL_MEM_FENCE);
618 }
619 if(GRID_SIZE >= 4)
620 {
621 if(lid < 2)
622 {
623 tmp_local[lid] = MAX_OP(tmp_local[lid + 2], tmp_local[lid], DATA_TYPE, 4);
624 }
625 barrier(CLK_LOCAL_MEM_FENCE);
626 }
627 if(lid == 0)
628 {
629 max_val_vec = MAX_OP(tmp_local[lid + 1], tmp_local[lid], DATA_TYPE, 4);
630 max_val_vec.s01 = MAX_OP(max_val_vec.s01, max_val_vec.s23, DATA_TYPE, 2);
631 max_val_vec.s0 = MAX_OP(max_val_vec.s0, max_val_vec.s1, DATA_TYPE, 1);
632 max_local = max_val_vec.s0;
633 }
634 barrier(CLK_LOCAL_MEM_FENCE);
635
636 /* Second section */
637
638 // Set sum vector
639 VEC_DATA_TYPE(DATA_TYPE, 4)
640 sum1D = 0;
641 DATA_TYPE max_val = max_local;
642
643 // Shift values, exp and sum
644 for(i = 0; i < width_; i++)
645 {
646 VEC_DATA_TYPE(DATA_TYPE, 4)
647 data = VLOAD(4)(0, (__global DATA_TYPE *)offset(&src, i * GRID_SIZE * 4, 0));
648 data = SUB_OP(data, max_val, DATA_TYPE, 4);
649#ifdef BETA
650 data = MUL_OP(data, beta, DATA_TYPE, 4);
651#endif /* BETA */
652 data = EXP_OP(data, DATA_TYPE, 4);
653 VSTORE(4)
654 (data, 0, (__global DATA_TYPE *)offset(&dst, i * GRID_SIZE * 4, 0));
655 sum1D = ADD_OP(sum1D, data, DATA_TYPE, 4);
656 }
657#ifdef NON_MULTIPLE_OF_GRID_SIZE
658 //TODO: Optimize the calculation (avoid %).
659 boundary_workitems = (width % (GRID_SIZE * 4)) / 4;
660 if(lid < boundary_workitems)
661 {
662 VEC_DATA_TYPE(DATA_TYPE, 4)
663 data = VLOAD(4)(0, (__global DATA_TYPE *)offset(&src, i * GRID_SIZE * 4, 0));
664 data = SUB_OP(data, max_val, DATA_TYPE, 4);
665#ifdef BETA
666 data = MUL_OP(data, beta, DATA_TYPE, 4);
667#endif /* BETA */
668 data = EXP_OP(data, DATA_TYPE, 4);
669 VSTORE(4)
670 (data, 0, (__global DATA_TYPE *)offset(&dst, i * GRID_SIZE * 4, 0));
671 sum1D = ADD_OP(sum1D, data, DATA_TYPE, 4);
672 }
673#ifdef NON_MULTIPLE_OF_VECTOR_SIZE
674 if(boundary_workitems == 0)
675 {
676 boundary_workitems = GRID_SIZE;
677 i--;
678 }
679 if(lid == (boundary_workitems - 1))
680 {
681 // Handle non multiple of vector size ((GRID_SIZE * i * 4) + 4, 0); move 4 float positions ahead, *4 is due to the stride
682 VEC_DATA_TYPE(DATA_TYPE, 4)
683 data = VLOAD(4)(0, (__global DATA_TYPE *)offset(&src, (GRID_SIZE * i * 4) + 4, 0));
684 data = SUB_OP(data, max_val, DATA_TYPE, 4);
685#ifdef BETA
686 data = MUL_OP(data, beta, DATA_TYPE, 4);
687#endif /* BETA */
688 data = EXP_OP(data, DATA_TYPE, 4);
689 VEC_DATA_TYPE(SELECT_DATA_TYPE, 4)
690 widx = CONVERT(((uint4)(GRID_SIZE * i * 4) + boundary_workitems * 4 + idx4) < width, VEC_DATA_TYPE(SELECT_DATA_TYPE, 4));
691 data = select(0, data, widx);
692 VSTORE(4)
693 (data, 0, (__global DATA_TYPE *)offset(&dst, (GRID_SIZE * i * 4) + 4, 0));
694 sum1D = ADD_OP(sum1D, data, DATA_TYPE, 4);
695 }
696#endif /* NON_MULTIPLE_OF_VECTOR_SIZE */
697#endif /* NON_MULTIPLE_OF_GRID_SIZE */
698 tmp_local[lid] = sum1D;
699
700 barrier(CLK_LOCAL_MEM_FENCE);
701
702 if(GRID_SIZE >= 256)
703 {
704 if(lid < 128)
705 {
706 tmp_local[lid] = ADD_OP(tmp_local[lid + 128], tmp_local[lid], DATA_TYPE, 4);
707 }
708 barrier(CLK_LOCAL_MEM_FENCE);
709 }
710 if(GRID_SIZE >= 128)
711 {
712 if(lid < 64)
713 {
714 tmp_local[lid] = ADD_OP(tmp_local[lid + 64], tmp_local[lid], DATA_TYPE, 4);
715 }
716 barrier(CLK_LOCAL_MEM_FENCE);
717 }
718 if(GRID_SIZE >= 64)
719 {
720 if(lid < 32)
721 {
722 tmp_local[lid] = ADD_OP(tmp_local[lid + 32], tmp_local[lid], DATA_TYPE, 4);
723 }
724 barrier(CLK_LOCAL_MEM_FENCE);
725 }
726 if(GRID_SIZE >= 32)
727 {
728 if(lid < 16)
729 {
730 tmp_local[lid] = ADD_OP(tmp_local[lid + 16], tmp_local[lid], DATA_TYPE, 4);
731 }
732 barrier(CLK_LOCAL_MEM_FENCE);
733 }
734 if(GRID_SIZE >= 16)
735 {
736 if(lid < 8)
737 {
738 tmp_local[lid] = ADD_OP(tmp_local[lid + 8], tmp_local[lid], DATA_TYPE, 4);
739 }
740 barrier(CLK_LOCAL_MEM_FENCE);
741 }
742 if(GRID_SIZE >= 8)
743 {
744 if(lid < 4)
745 {
746 tmp_local[lid] = ADD_OP(tmp_local[lid + 4], tmp_local[lid], DATA_TYPE, 4);
747 }
748 barrier(CLK_LOCAL_MEM_FENCE);
749 }
750 if(GRID_SIZE >= 4)
751 {
752 if(lid < 2)
753 {
754 tmp_local[lid] = ADD_OP(tmp_local[lid + 2], tmp_local[lid], DATA_TYPE, 4);
755 }
756 barrier(CLK_LOCAL_MEM_FENCE);
757 }
758 if(lid == 0)
759 {
760 sum1D = ADD_OP(tmp_local[lid + 1], tmp_local[lid], DATA_TYPE, 4);
761 // Perform max reduction
762 sum1D.s01 = ADD_OP(sum1D.s01, sum1D.s23, DATA_TYPE, 2);
763 sum1D.s0 = ADD_OP(sum1D.s0, sum1D.s1, DATA_TYPE, 1);
764 *((__global DATA_TYPE *)sum.ptr) = sum1D.s0;
765 }
766}