blob: e69c3c35e9d32e6c0471ab87071afa8ebd7e06c3 [file] [log] [blame]
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001/*
Michele Di Giorgiod9eaf612020-07-08 11:12:57 +01002 * Copyright (c) 2017-2020 Arm Limited.
Anthony Barbier6ff3b192017-09-04 18:44:23 +01003 *
4 * SPDX-License-Identifier: MIT
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to
8 * deal in the Software without restriction, including without limitation the
9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10 * sell copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in all
14 * copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24#include "helpers.h"
Gian Marco Iodice7333e1f2020-10-08 10:25:49 +010025#include "repeat.h"
Anthony Barbier6ff3b192017-09-04 18:44:23 +010026
Georgios Pinitascdf51452017-08-31 14:21:36 +010027#if defined(POOL_AVG) || defined(POOL_L2)
Anthony Barbier6ff3b192017-09-04 18:44:23 +010028#define POOL_OP(x, y) ((x) + (y))
Georgios Pinitascdf51452017-08-31 14:21:36 +010029#else /* defined(POOL_AVG) || defined(POOL_L2) */
Anthony Barbier6ff3b192017-09-04 18:44:23 +010030#define POOL_OP(x, y) (fmax((x), (y)))
Georgios Pinitascdf51452017-08-31 14:21:36 +010031#endif /* defined(POOL_AVG) || defined(POOL_L2) */
32
33#if defined(POOL_L2)
34#define POW2_OP(x, vec_size) ((x) * (x))
35#else /* defined(POOL_L2) */
36#define POW2_OP(x, vec_size) (x)
37#endif /* defined(POOL_L2) */
Anthony Barbier6ff3b192017-09-04 18:44:23 +010038
steniu010c7614f2017-06-23 17:00:26 +010039#define DIV_OP(x, y) (x * (1.f / y))
Georgios Pinitascdf51452017-08-31 14:21:36 +010040#define SQRT_OP(x) sqrt((x))
steniu010c7614f2017-06-23 17:00:26 +010041
Gian Marco Iodicecb292832017-08-02 13:19:48 +010042#if STRIDE_X == 1
43#define POOLING3x3(res, input, output) POOLING3x3_STRIDE1(res, input, output)
44#elif STRIDE_X == 2 /* STRIDE_X == 1 */
45#define POOLING3x3(res, input, output) POOLING3x3_STRIDE2(res, input, output)
46#elif STRIDE_X == 3 /* STRIDE_X not equals 1 or 2 */
47#define POOLING3x3(res, input, output) POOLING3x3_STRIDE3(res, input, output)
48#endif /* STRIDE_X == 3 */
49
Sang-Hoon Park2aa7fd02019-09-18 13:39:00 +010050#if defined(FP_MIXED_PRECISION)
51#define CONVERT_TO_ACC_DATA_TYPE(x, n) CONVERT(x, VEC_DATA_TYPE(ACC_DATA_TYPE, n))
52#define VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(n, offset, ptr) \
53 CONVERT_TO_ACC_DATA_TYPE(vload##n(offset, ptr), n)
54#else /* defined(FP_MIXED_PRECISION) */
55#define VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(n, offset, ptr) vload##n(offset, ptr)
56#endif /* defined(FP_MIXED_PRECISION) */
57
58#define POOLING3x3_STRIDE1(res, input, output) \
59 ({ \
60 VEC_DATA_TYPE(ACC_DATA_TYPE, 4) \
61 data00 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(4, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 0, 0)); \
62 VEC_DATA_TYPE(ACC_DATA_TYPE, 2) \
63 data01 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(2, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 0, 0) + 4); \
64 VEC_DATA_TYPE(ACC_DATA_TYPE, 4) \
65 data10 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(4, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 1, 0)); \
66 VEC_DATA_TYPE(ACC_DATA_TYPE, 2) \
67 data11 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(2, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 1, 0) + 4); \
68 VEC_DATA_TYPE(ACC_DATA_TYPE, 4) \
69 data20 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(4, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 2, 0)); \
70 VEC_DATA_TYPE(ACC_DATA_TYPE, 2) \
71 data21 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(2, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 2, 0) + 4); \
72 data00 = POW2_OP(data00, 4); \
73 data01 = POW2_OP(data01, 2); \
74 data10 = POW2_OP(data10, 4); \
75 data11 = POW2_OP(data11, 2); \
76 data20 = POW2_OP(data20, 4); \
77 data21 = POW2_OP(data21, 2); \
78 \
79 VEC_DATA_TYPE(ACC_DATA_TYPE, 8) \
80 values00 = (VEC_DATA_TYPE(ACC_DATA_TYPE, 8))(data00.s01212323); \
81 VEC_DATA_TYPE(ACC_DATA_TYPE, 4) \
82 values01 = (VEC_DATA_TYPE(ACC_DATA_TYPE, 4))(data01.s0, data00.s3, data01.s01); \
83 VEC_DATA_TYPE(ACC_DATA_TYPE, 8) \
84 values10 = (VEC_DATA_TYPE(ACC_DATA_TYPE, 8))(data10.s01212323); \
85 VEC_DATA_TYPE(ACC_DATA_TYPE, 4) \
86 values11 = (VEC_DATA_TYPE(ACC_DATA_TYPE, 4))(data11.s0, data10.s3, data11.s01); \
87 VEC_DATA_TYPE(ACC_DATA_TYPE, 8) \
88 values20 = (VEC_DATA_TYPE(ACC_DATA_TYPE, 8))(data20.s01212323); \
89 VEC_DATA_TYPE(ACC_DATA_TYPE, 4) \
90 values21 = (VEC_DATA_TYPE(ACC_DATA_TYPE, 4))(data21.s0, data20.s3, data21.s01); \
91 \
92 values00 = POOL_OP(values00, values10); \
93 values01 = POOL_OP(values01, values11); \
94 values00 = POOL_OP(values00, values20); \
95 values01 = POOL_OP(values01, values21); \
96 \
97 res = POOL_OP((VEC_DATA_TYPE(ACC_DATA_TYPE, 4))(values00.s036, values01.s1), (VEC_DATA_TYPE(ACC_DATA_TYPE, 4))(values00.s147, values01.s2)); \
98 res = POOL_OP(res, (VEC_DATA_TYPE(ACC_DATA_TYPE, 4))(values00.s25, values01.s03)); \
99 })
100
101#define POOLING3x3_STRIDE2(res, input, output) \
102 ({ \
103 VEC_DATA_TYPE(ACC_DATA_TYPE, 8) \
104 data00 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(8, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 0, 0)); \
105 ACC_DATA_TYPE data01 = (ACC_DATA_TYPE)(*((__global DATA_TYPE *)tensor3D_offset(&input, 0, 0, 0) + 8)); \
106 VEC_DATA_TYPE(ACC_DATA_TYPE, 8) \
107 data10 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(8, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 1, 0)); \
108 ACC_DATA_TYPE data11 = (ACC_DATA_TYPE)(*((__global DATA_TYPE *)tensor3D_offset(&input, 0, 1, 0) + 8)); \
109 VEC_DATA_TYPE(ACC_DATA_TYPE, 8) \
110 data20 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(8, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 2, 0)); \
111 ACC_DATA_TYPE data21 = (ACC_DATA_TYPE)(*((__global DATA_TYPE *)tensor3D_offset(&input, 0, 2, 0) + 8)); \
112 data00 = POW2_OP(data00, 8); \
113 data01 = POW2_OP(data01, 1); \
114 data10 = POW2_OP(data10, 8); \
115 data11 = POW2_OP(data11, 1); \
116 data20 = POW2_OP(data20, 8); \
117 data21 = POW2_OP(data21, 1); \
118 \
119 VEC_DATA_TYPE(ACC_DATA_TYPE, 8) \
120 values00 = (VEC_DATA_TYPE(ACC_DATA_TYPE, 8))(data00.s01223445); \
121 VEC_DATA_TYPE(ACC_DATA_TYPE, 4) \
122 values01 = (VEC_DATA_TYPE(ACC_DATA_TYPE, 4))(data00.s667, data01); \
123 VEC_DATA_TYPE(ACC_DATA_TYPE, 8) \
124 values10 = (VEC_DATA_TYPE(ACC_DATA_TYPE, 8))(data10.s01223445); \
125 VEC_DATA_TYPE(ACC_DATA_TYPE, 4) \
126 values11 = (VEC_DATA_TYPE(ACC_DATA_TYPE, 4))(data10.s667, data11); \
127 VEC_DATA_TYPE(ACC_DATA_TYPE, 8) \
128 values20 = (VEC_DATA_TYPE(ACC_DATA_TYPE, 8))(data20.s01223445); \
129 VEC_DATA_TYPE(ACC_DATA_TYPE, 4) \
130 values21 = (VEC_DATA_TYPE(ACC_DATA_TYPE, 4))(data20.s667, data21); \
131 \
132 values00 = POOL_OP(values00, values10); \
133 values01 = POOL_OP(values01, values11); \
134 values00 = POOL_OP(values00, values20); \
135 values01 = POOL_OP(values01, values21); \
136 \
137 res = POOL_OP((VEC_DATA_TYPE(ACC_DATA_TYPE, 4))(values00.s036, values01.s1), (VEC_DATA_TYPE(ACC_DATA_TYPE, 4))(values00.s147, values01.s2)); \
138 res = POOL_OP(res, (VEC_DATA_TYPE(ACC_DATA_TYPE, 4))(values00.s25, values01.s03)); \
139 })
140
141#define POOLING3x3_STRIDE3(res, input, output) \
Gian Marco Iodicecb292832017-08-02 13:19:48 +0100142 ({ \
Sang-Hoon Park2aa7fd02019-09-18 13:39:00 +0100143 VEC_DATA_TYPE(ACC_DATA_TYPE, 8) \
144 data00 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(8, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 0, 0)); \
145 VEC_DATA_TYPE(ACC_DATA_TYPE, 4) \
146 data01 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(4, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 0, 0) + 8); \
147 VEC_DATA_TYPE(ACC_DATA_TYPE, 8) \
148 data10 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(8, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 1, 0)); \
149 VEC_DATA_TYPE(ACC_DATA_TYPE, 4) \
150 data11 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(4, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 1, 0) + 8); \
151 VEC_DATA_TYPE(ACC_DATA_TYPE, 8) \
152 data20 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(8, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 2, 0)); \
153 VEC_DATA_TYPE(ACC_DATA_TYPE, 4) \
154 data21 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(4, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 2, 0) + 8); \
155 data00 = POW2_OP(data00, 8); \
156 data01 = POW2_OP(data01, 4); \
157 data10 = POW2_OP(data10, 8); \
158 data11 = POW2_OP(data11, 4); \
159 data20 = POW2_OP(data20, 8); \
160 data21 = POW2_OP(data21, 4); \
Gian Marco Iodicecb292832017-08-02 13:19:48 +0100161 \
Sang-Hoon Park2aa7fd02019-09-18 13:39:00 +0100162 data00 = POOL_OP(data00, data10); \
163 data01 = POOL_OP(data01, data11); \
164 data00 = POOL_OP(data00, data20); \
165 data01 = POOL_OP(data01, data21); \
Gian Marco Iodicecb292832017-08-02 13:19:48 +0100166 \
Sang-Hoon Park2aa7fd02019-09-18 13:39:00 +0100167 res = POOL_OP((VEC_DATA_TYPE(ACC_DATA_TYPE, 4))(data00.s036, data01.s1), (VEC_DATA_TYPE(ACC_DATA_TYPE, 4))(data00.s147, data01.s2)); \
168 res = POOL_OP(res, (VEC_DATA_TYPE(ACC_DATA_TYPE, 4))(data00.s25, data01.s03)); \
Gian Marco Iodicecb292832017-08-02 13:19:48 +0100169 })
170
Sang-Hoon Park2aa7fd02019-09-18 13:39:00 +0100171ACC_DATA_TYPE calculate_avg_scale(const int pool_size_x, const int pool_size_y, const int upper_bound_w, const int upper_bound_h,
172 const int pad_x, const int pad_y, const int stride_x, const int stride_y)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100173{
Georgios Pinitasadaae7e2017-10-30 15:56:32 +0000174 int start_x = get_global_id(0) * stride_x - pad_x;
175 int start_y = get_global_id(1) * stride_y - pad_y;
Isabella Gottardia527e8c2018-01-31 17:49:25 +0000176 const int end_x = min(start_x + pool_size_x, upper_bound_w);
177 const int end_y = min(start_y + pool_size_y, upper_bound_h);
Georgios Pinitasadaae7e2017-10-30 15:56:32 +0000178#if defined(EXCLUDE_PADDING)
179 start_x = max(0, start_x);
180 start_y = max(0, start_y);
181#endif /* defined(EXCLUDE_PADDING) */
steniu010c7614f2017-06-23 17:00:26 +0100182 return ((end_y - start_y) * (end_x - start_x));
Gian Marco Iodicecb292832017-08-02 13:19:48 +0100183}
184
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100185/** Performs a pooling function of pool size equal to 2.
186 *
Vidhya Sudhan Loganathan7485d5a2018-07-04 09:34:00 +0100187 * @note Datatype must be passed using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types are F16/F32;
Gian Marco Iodicecb292832017-08-02 13:19:48 +0100188 * @note In case of average pooling the following information must be passed at compile time:
Georgios Pinitascdf51452017-08-31 14:21:36 +0100189 * -DPOOL_AVG or -DPOOL_L2 must be provided otherwise max pooling will be performed.
Gian Marco Iodicecb292832017-08-02 13:19:48 +0100190 * -DMAX_WIDTH and -DMAX_HEIGHT which are the maximum accessible indeces in x and y dimensions (width + pad)
191 * -DSTRIDE_X and -DSTRIDE_Y which are the steps of the window along the x and y directions
192 * -DPAD_X and -DPAD_Y which are the pooling paddings in x and y dimension
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100193 *
Sheri Zhang801bbcb2020-08-03 20:11:56 +0100194 * @param[in] input_ptr Pointer to the source tensor. Supported data types: F16/F32
195 * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100196 * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
Sheri Zhang801bbcb2020-08-03 20:11:56 +0100197 * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100198 * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
199 * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
200 * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
Sheri Zhang801bbcb2020-08-03 20:11:56 +0100201 * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor
202 * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
203 * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100204 * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
Sheri Zhang801bbcb2020-08-03 20:11:56 +0100205 * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100206 * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
207 * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes)
208 * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
Sheri Zhang801bbcb2020-08-03 20:11:56 +0100209 * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100210 */
211__kernel void pooling_layer_2(
212 TENSOR3D_DECLARATION(input),
Gian Marco Iodicecb292832017-08-02 13:19:48 +0100213 TENSOR3D_DECLARATION(output))
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100214{
215 // Get pixels pointer
216 Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
217 Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
218
219 // Load data
Sang-Hoon Park2aa7fd02019-09-18 13:39:00 +0100220 VEC_DATA_TYPE(ACC_DATA_TYPE, 2)
221 data0 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(2, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 0, 0));
222 VEC_DATA_TYPE(ACC_DATA_TYPE, 2)
223 data1 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(2, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 1, 0));
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100224
Georgios Pinitascdf51452017-08-31 14:21:36 +0100225#if defined(POOL_L2)
226 // Raise to power of 2 for L2 Pooling
227 data0 = POW2_OP(data0, 2);
228 data1 = POW2_OP(data1, 2);
229#endif /* defined(POOL_L2) */
230
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100231 // Perform calculations
Sang-Hoon Park2aa7fd02019-09-18 13:39:00 +0100232 data0 = POOL_OP(data0, data1);
233 ACC_DATA_TYPE res = POOL_OP(data0.s0, data0.s1);
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100234
Georgios Pinitascdf51452017-08-31 14:21:36 +0100235#if defined(POOL_AVG) || defined(POOL_L2)
236 // Divide by pool region in case of average or l2 pooling
Isabella Gottardia527e8c2018-01-31 17:49:25 +0000237 res = DIV_OP(res, calculate_avg_scale(2, 2, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y));
Georgios Pinitascdf51452017-08-31 14:21:36 +0100238#endif /* defined(POOL_AVG) || defined(POOL_L2) */
239
240#if defined(POOL_L2)
241 // Take square root of the result in L2 pooling
242 res = SQRT_OP(res);
243#endif /* defined(POOL_L2) */
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100244
245 // Store result
Sang-Hoon Park2aa7fd02019-09-18 13:39:00 +0100246 *(__global DATA_TYPE *)output.ptr = (DATA_TYPE)res;
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100247}
248
Gian Marco Iodicecb292832017-08-02 13:19:48 +0100249/** Performs a pooling function of pool size equal to 3
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100250 *
Vidhya Sudhan Loganathan7485d5a2018-07-04 09:34:00 +0100251 * @note Datatype must be passed using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types are F16/F32;
Gian Marco Iodicecb292832017-08-02 13:19:48 +0100252 * @note In case of average pooling the following information must be passed at compile time:
Georgios Pinitascdf51452017-08-31 14:21:36 +0100253 * -DPOOL_AVG or -DPOOL_L2 must be provided otherwise max pooling will be performed.
Gian Marco Iodicecb292832017-08-02 13:19:48 +0100254 * -DMAX_WIDTH and -DMAX_HEIGHT which are the maximum accessible indeces in x and y dimensions (width + pad)
255 * -DSTRIDE_X and -DSTRIDE_Y which are the steps of the window along the x and y directions
256 * -DPAD_X and -DPAD_Y which are the pooling paddings in x and y dimension
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100257 *
Sheri Zhang801bbcb2020-08-03 20:11:56 +0100258 * @param[in] input_ptr Pointer to the source tensor. Supported data types: F16/F32
259 * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100260 * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
Sheri Zhang801bbcb2020-08-03 20:11:56 +0100261 * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100262 * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
263 * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
264 * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
Sheri Zhang801bbcb2020-08-03 20:11:56 +0100265 * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor
266 * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
267 * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100268 * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
Sheri Zhang801bbcb2020-08-03 20:11:56 +0100269 * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100270 * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
271 * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes)
272 * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
Sheri Zhang801bbcb2020-08-03 20:11:56 +0100273 * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100274 */
275__kernel void pooling_layer_3(
276 TENSOR3D_DECLARATION(input),
Gian Marco Iodicecb292832017-08-02 13:19:48 +0100277 TENSOR3D_DECLARATION(output))
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100278{
279 // Get pixels pointer
280 Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
281 Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
282
283 // Load data
Sang-Hoon Park2aa7fd02019-09-18 13:39:00 +0100284 VEC_DATA_TYPE(ACC_DATA_TYPE, 3)
285 data0 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(3, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 0, 0));
286 VEC_DATA_TYPE(ACC_DATA_TYPE, 3)
287 data1 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(3, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 1, 0));
288 VEC_DATA_TYPE(ACC_DATA_TYPE, 3)
289 data2 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(3, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 2, 0));
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100290
Georgios Pinitascdf51452017-08-31 14:21:36 +0100291#if defined(POOL_L2)
292 // Raise to power of 2 for L2 Pooling
293 data0 = POW2_OP(data0, 3);
294 data1 = POW2_OP(data1, 3);
295 data2 = POW2_OP(data2, 3);
296#endif /* defined(POOL_L2) */
297
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100298 // Perform calculations
Sang-Hoon Park2aa7fd02019-09-18 13:39:00 +0100299 data0 = POOL_OP(data0, data1);
300 data0 = POOL_OP(data0, data2);
301 ACC_DATA_TYPE res = POOL_OP(POOL_OP(data0.s0, data0.s1), data0.s2);
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100302
Georgios Pinitascdf51452017-08-31 14:21:36 +0100303#if defined(POOL_AVG) || defined(POOL_L2)
Georgios Pinitasce093142017-06-19 16:11:53 +0100304 // Divide by pool region in case of average pooling
Isabella Gottardia527e8c2018-01-31 17:49:25 +0000305 res = DIV_OP(res, calculate_avg_scale(3, 3, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y));
Georgios Pinitascdf51452017-08-31 14:21:36 +0100306#endif /* defined(POOL_AVG) || defined(POOL_L2) */
307
308#if defined(POOL_L2)
309 // Take square root of the result in L2 pooling
310 res = SQRT_OP(res);
311#endif /* defined(POOL_L2) */
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100312
313 // Store result
Sang-Hoon Park2aa7fd02019-09-18 13:39:00 +0100314 *(__global DATA_TYPE *)output.ptr = (DATA_TYPE)res;
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100315}
Georgios Pinitasce093142017-06-19 16:11:53 +0100316
Vidhya Sudhan Loganathan7485d5a2018-07-04 09:34:00 +0100317#if defined(POOLING3x3)
steniu010c7614f2017-06-23 17:00:26 +0100318
319#define CONVERT_OP(data_type) convert_##data_type##4
320#define CONVERT_VECTOR4(data_type) CONVERT_OP(data_type)
321
Sang-Hoon Park2aa7fd02019-09-18 13:39:00 +0100322VEC_DATA_TYPE(ACC_DATA_TYPE, 4)
steniu010c7614f2017-06-23 17:00:26 +0100323calculate_avg_scale4(const int pool_size, const int upper_bound_w, const int upper_bound_h,
324 const int pad_x, const int pad_y, const int stride_x, const int stride_y)
325{
Georgios Pinitasadaae7e2017-10-30 15:56:32 +0000326 int4 start_x = ((int4)get_global_id(0) * 4 + (int4)(0, 1, 2, 3)) * (int4)stride_x - (int4)pad_x;
327 int start_y = get_global_id(1) * stride_y - pad_y;
steniu010c7614f2017-06-23 17:00:26 +0100328 const int4 end_x = min(start_x + (int4)pool_size, (int4)upper_bound_w);
329 const int end_y = min(start_y + pool_size, upper_bound_h);
Georgios Pinitasadaae7e2017-10-30 15:56:32 +0000330#if defined(EXCLUDE_PADDING)
331 start_x = max((int4)0, start_x);
332 start_y = max(0, start_y);
333#endif /* defined(EXCLUDE_PADDING) */
Sang-Hoon Park2aa7fd02019-09-18 13:39:00 +0100334 return (VEC_DATA_TYPE(ACC_DATA_TYPE, 4))(1.f) / CONVERT_VECTOR4(ACC_DATA_TYPE)(((int4)(end_y - start_y)) * (end_x - start_x));
steniu010c7614f2017-06-23 17:00:26 +0100335}
336
Gian Marco Iodicecb292832017-08-02 13:19:48 +0100337/** Performs an optimized pooling function of pool size equal to 3 when the stride_x is less equal than 3
Georgios Pinitasce093142017-06-19 16:11:53 +0100338 *
Vidhya Sudhan Loganathan7485d5a2018-07-04 09:34:00 +0100339 * @note Datatype must be passed using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types are F16/F32;
Gian Marco Iodicecb292832017-08-02 13:19:48 +0100340 * @note In case of average pooling the following information must be passed at compile time:
Georgios Pinitascdf51452017-08-31 14:21:36 +0100341 * -DPOOL_AVG or -DPOOL_L2 must be provided otherwise max pooling will be performed.
Gian Marco Iodicecb292832017-08-02 13:19:48 +0100342 * -DMAX_WIDTH and -DMAX_HEIGHT which are the maximum accessible indeces in x and y dimensions (width + pad)
343 * -DSTRIDE_X and -DSTRIDE_Y which are the steps of the window along the x and y directions
344 * -DPAD_X and -DPAD_Y which are the pooling paddings in x and y dimension
Georgios Pinitasce093142017-06-19 16:11:53 +0100345 *
Sheri Zhang801bbcb2020-08-03 20:11:56 +0100346 * @param[in] input_ptr Pointer to the source tensor. Supported data types: F16/F32
347 * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
Georgios Pinitasce093142017-06-19 16:11:53 +0100348 * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
Sheri Zhang801bbcb2020-08-03 20:11:56 +0100349 * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
Georgios Pinitasce093142017-06-19 16:11:53 +0100350 * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
351 * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
352 * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
Sheri Zhang801bbcb2020-08-03 20:11:56 +0100353 * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor
354 * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
355 * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
Georgios Pinitasce093142017-06-19 16:11:53 +0100356 * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
Sheri Zhang801bbcb2020-08-03 20:11:56 +0100357 * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
Georgios Pinitasce093142017-06-19 16:11:53 +0100358 * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
359 * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes)
360 * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
Sheri Zhang801bbcb2020-08-03 20:11:56 +0100361 * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
Gian Marco Iodicecb292832017-08-02 13:19:48 +0100362 */
Anton Lokhmotovaf6204c2017-11-08 09:34:19 +0000363__kernel void pooling_layer_optimized_3(
Gian Marco Iodicecb292832017-08-02 13:19:48 +0100364 TENSOR3D_DECLARATION(input),
365 TENSOR3D_DECLARATION(output))
366{
367 // Get pixels pointer
368 Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
369 Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
370
Sang-Hoon Park2aa7fd02019-09-18 13:39:00 +0100371 VEC_DATA_TYPE(ACC_DATA_TYPE, 4)
Gian Marco Iodicecb292832017-08-02 13:19:48 +0100372 res;
373
374 // Perform pooling 3x3 for 4 output elements
375 POOLING3x3(res, input, output);
376
Georgios Pinitascdf51452017-08-31 14:21:36 +0100377#if defined(POOL_AVG) || defined(POOL_L2)
Gian Marco Iodicecb292832017-08-02 13:19:48 +0100378 // Divide by pool region in case of average pooling
Gian Marco Iodicecb292832017-08-02 13:19:48 +0100379 res *= calculate_avg_scale4(3, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y);
Georgios Pinitascdf51452017-08-31 14:21:36 +0100380#endif /* defined(POOL_AVG) || defined(POOL_L2) */
381
382#if defined(POOL_L2)
383 // Take square root of the result in L2 pooling
384 res = SQRT_OP(res);
385#endif /* defined(POOL_L2) */
Gian Marco Iodicecb292832017-08-02 13:19:48 +0100386
Sang-Hoon Park2aa7fd02019-09-18 13:39:00 +0100387 vstore4(CONVERT(res, VEC_DATA_TYPE(DATA_TYPE, 4)), 0, (__global DATA_TYPE *)output.ptr);
Gian Marco Iodicecb292832017-08-02 13:19:48 +0100388}
Vidhya Sudhan Loganathan7485d5a2018-07-04 09:34:00 +0100389#endif // defined(POOLING3x3)
Gian Marco Iodicecb292832017-08-02 13:19:48 +0100390
Isabella Gottardia527e8c2018-01-31 17:49:25 +0000391#if defined(POOL_SIZE_X) && defined(POOL_SIZE_Y)
Gian Marco Iodicebf179552017-09-05 13:51:21 +0100392
Michalis Spyroue74b2012018-04-18 09:49:16 +0100393/** Performs a pooling function of pool size equal to N (NCHW)
Gian Marco Iodicebf179552017-09-05 13:51:21 +0100394 *
Vidhya Sudhan Loganathan7485d5a2018-07-04 09:34:00 +0100395 * @note Datatype must be passed using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types are F16/F32;
Isabella Gottardia527e8c2018-01-31 17:49:25 +0000396 * @note Pool sizes must be passed using -DPOOL_SIZE_X and -DPOOL_SIZE_Y e.g. -DPOOL_SIZE_X=13;
Gian Marco Iodicebf179552017-09-05 13:51:21 +0100397 * @note In case of average pooling the following information must be passed at compile time:
398 * -DPOOL_AVG must be provided otherwise max pooling will be performed.
399 * -DMAX_WIDTH and -DMAX_HEIGHT which are the maximum accessible indeces in x and y dimensions (width + pad)
400 * -DSTRIDE_X and -DSTRIDE_Y which are the steps of the window along the x and y directions
401 * -DPAD_X and -DPAD_Y which are the pooling paddings in x and y dimension
Michele Di Giorgiocbbed282019-12-20 13:26:08 +0000402 * @note The initial value for the pooling operation must be passed at compile time using -DINITIAL_VALUE e.g. -DINITIAL_VALUE=0
Gian Marco Iodicebf179552017-09-05 13:51:21 +0100403 *
Sheri Zhang801bbcb2020-08-03 20:11:56 +0100404 * @param[in] input_ptr Pointer to the source tensor. Supported data types: F16/F32
405 * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
Gian Marco Iodicebf179552017-09-05 13:51:21 +0100406 * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
Sheri Zhang801bbcb2020-08-03 20:11:56 +0100407 * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
Gian Marco Iodicebf179552017-09-05 13:51:21 +0100408 * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
409 * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
410 * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
Sheri Zhang801bbcb2020-08-03 20:11:56 +0100411 * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor
412 * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
413 * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
Gian Marco Iodicebf179552017-09-05 13:51:21 +0100414 * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
Sheri Zhang801bbcb2020-08-03 20:11:56 +0100415 * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
Gian Marco Iodicebf179552017-09-05 13:51:21 +0100416 * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
417 * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes)
418 * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
Sheri Zhang801bbcb2020-08-03 20:11:56 +0100419 * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
Gian Marco Iodicebf179552017-09-05 13:51:21 +0100420 */
Michalis Spyroue74b2012018-04-18 09:49:16 +0100421__kernel void pooling_layer_MxN_nchw(
Gian Marco Iodicebf179552017-09-05 13:51:21 +0100422 TENSOR3D_DECLARATION(input),
423 TENSOR3D_DECLARATION(output))
424{
425 // Get pixels pointer
426 Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
427 Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
428
Sang-Hoon Park2aa7fd02019-09-18 13:39:00 +0100429 VEC_DATA_TYPE(ACC_DATA_TYPE, 8)
430 vdata = INITIAL_VALUE;
431 ACC_DATA_TYPE sdata = INITIAL_VALUE;
Gian Marco Iodicebf179552017-09-05 13:51:21 +0100432
433 // Load data
Isabella Gottardia527e8c2018-01-31 17:49:25 +0000434 for(int y = 0; y < POOL_SIZE_Y; y++)
Gian Marco Iodicebf179552017-09-05 13:51:21 +0100435 {
436 int x = 0;
Isabella Gottardia527e8c2018-01-31 17:49:25 +0000437 for(; x <= ((int)POOL_SIZE_X - 8); x += 8)
Gian Marco Iodicebf179552017-09-05 13:51:21 +0100438 {
Sang-Hoon Park2aa7fd02019-09-18 13:39:00 +0100439 VEC_DATA_TYPE(ACC_DATA_TYPE, 8)
440 data0 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(8, 0, (__global DATA_TYPE *)tensor3D_offset(&input, x, y, 0));
Georgios Pinitascdf51452017-08-31 14:21:36 +0100441#if defined(POOL_L2)
442 // Raise to power of 2 for L2 Pooling
443 data0 *= data0;
444#endif /* defined(POOL_L2) */
Gian Marco Iodicebf179552017-09-05 13:51:21 +0100445 vdata = POOL_OP(vdata, data0);
446 }
447
448 // Leftover
Isabella Gottardia527e8c2018-01-31 17:49:25 +0000449 for(; x < (int)POOL_SIZE_X; ++x)
Gian Marco Iodicebf179552017-09-05 13:51:21 +0100450 {
Sang-Hoon Park2aa7fd02019-09-18 13:39:00 +0100451 ACC_DATA_TYPE data0 = (ACC_DATA_TYPE)(*((__global DATA_TYPE *)tensor3D_offset(&input, x, y, 0)));
Georgios Pinitascdf51452017-08-31 14:21:36 +0100452#if defined(POOL_L2)
453 // Raise to power of 2 for L2 Pooling
454 data0 *= data0;
455#endif /* defined(POOL_L2) */
456 sdata = POOL_OP(sdata, data0);
Gian Marco Iodicebf179552017-09-05 13:51:21 +0100457 }
458 }
459
460 // Reduce result
Sang-Hoon Park2aa7fd02019-09-18 13:39:00 +0100461 VEC_DATA_TYPE(ACC_DATA_TYPE, 4)
Gian Marco Iodicebf179552017-09-05 13:51:21 +0100462 reduce4 = POOL_OP(vdata.s0123, vdata.s4567);
Sang-Hoon Park2aa7fd02019-09-18 13:39:00 +0100463 VEC_DATA_TYPE(ACC_DATA_TYPE, 2)
464 reduce2 = POOL_OP(reduce4.s01, reduce4.s23);
465 ACC_DATA_TYPE res = POOL_OP(reduce2.s0, reduce2.s1);
466 res = POOL_OP(res, sdata);
Gian Marco Iodicebf179552017-09-05 13:51:21 +0100467
Georgios Pinitascdf51452017-08-31 14:21:36 +0100468#if defined(POOL_AVG) || defined(POOL_L2)
Gian Marco Iodicebf179552017-09-05 13:51:21 +0100469 // Divide by pool region in case of average pooling
Isabella Gottardia527e8c2018-01-31 17:49:25 +0000470 res = DIV_OP(res, calculate_avg_scale(POOL_SIZE_X, POOL_SIZE_Y, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y));
Georgios Pinitascdf51452017-08-31 14:21:36 +0100471#endif /* defined(POOL_AVG) || defined(POOL_L2) */
472
473#if defined(POOL_L2)
474 // Take square root of the result in L2 pooling
475 res = SQRT_OP(res);
476#endif /* defined(POOL_L2) */
Gian Marco Iodicebf179552017-09-05 13:51:21 +0100477
478 // Store result
Sang-Hoon Park2aa7fd02019-09-18 13:39:00 +0100479 *(__global DATA_TYPE *)output.ptr = (DATA_TYPE)res;
Gian Marco Iodicebf179552017-09-05 13:51:21 +0100480}
Isabella Gottardia527e8c2018-01-31 17:49:25 +0000481#endif // defined(POOL_SIZE_X) && defined(POOL_SIZE_Y)
Michalis Spyroue74b2012018-04-18 09:49:16 +0100482
Sheri Zhang801bbcb2020-08-03 20:11:56 +0100483#if defined(PAD_TENSOR_LEFT) && defined(PAD_TENSOR_RIGHT) && defined(PAD_TENSOR_TOP) && defined(PAD_TENSOR_BOTTOM)
484
485inline void offset_no_padding_nchw(const Tensor3D *input, uint *offset_top, uint *offset_bottom)
486{
487 const int pad_horiz = PAD_TENSOR_LEFT + PAD_TENSOR_RIGHT;
488 const int pad_vert = PAD_TENSOR_TOP + PAD_TENSOR_BOTTOM;
489
490 const int x = get_global_id(0) * STRIDE_X;
491 const int y = get_global_id(1) * STRIDE_Y;
492 const int z = get_global_id(2);
493
494 //x axis: width, y axis: height, z axis: component
495 const uint padded_offset = input->offset_first_element_in_bytes
496 + x * input->stride_x
497 + y * input->stride_y
498 + z * input->stride_z;
499
500 const uint offset_base = padded_offset
501 - y * pad_horiz * sizeof(DATA_TYPE) /* Horizontal padding for each row */
502 - PAD_TENSOR_TOP * input->stride_y /* top padding */
503 - z * MAX_HEIGHT * pad_horiz * sizeof(DATA_TYPE) - z * pad_vert * input->stride_y /* Z plane padding */
504 - PAD_TENSOR_LEFT * sizeof(DATA_TYPE);
505
506#if defined(TENSOR_CHANNEL) && defined(TENSOR_WIDTH) && defined(TENSOR_HEIGHT)
507 *offset_top = (uint)((offset_base / sizeof(DATA_TYPE)) % (TENSOR_CHANNEL * TENSOR_WIDTH * TENSOR_HEIGHT));
508#else /* defined(TENSOR_CHANNEL) && defined(TENSOR_WIDTH) && defined(TENSOR_HEIGHT) */
509 *offset_top = (uint)(offset_base / sizeof(DATA_TYPE));
510#endif /* defined(TENSOR_CHANNEL) && defined(TENSOR_WIDTH) && defined(TENSOR_HEIGHT) */
511
512 *offset_bottom = *offset_top + input->stride_y / sizeof(DATA_TYPE) - pad_horiz;
513
514 return;
515}
516
Sheri Zhang801bbcb2020-08-03 20:11:56 +0100517#endif //defined(PAD_TENSOR_LEFT) && defined(PAD_TENSOR_RIGHT) && defined(PAD_TENSOR_TOP) && defined(PAD_TENSOR_BOTTOM)
518
519/** Performs a MAX pooling of pool size equal to 2, and record max value indices for NCHW.
520 *
521 * @note Datatype must be passed using -DDATA_TYPE e.g. -DDATA_TYPE=half. Supported data types are F32
522 * @note Pool sizes must be passed using -DPOOL_SIZE_X and -DPOOL_SIZE_Y e.g. -DPOOL_SIZE_X=13;
523 * @note Tensors width and height must be passed at compile time using -DMAX_WIDTH and -DMAX_HEIGHT
524 * @note Pool strides must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y which are the steps of the window along the x and y directions
525 * @note Tensor padding values must be passed at compile time using PAD_TENSOR_LEFT, PAD_TENSOR_RIGHT, PAD_TENSOR_TOP and PAD_TENSOR_BOTTOM
526 *
527 * @param[in] input_ptr Pointer to the source tensor. Supported data types: F32
528 * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
529 * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
530 * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
531 * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
532 * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
533 * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
534 * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor
535 * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
536 * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
537 * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
538 * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
539 * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
540 * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes)
541 * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
542 * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
543 * @param[in] indices_ptr Pointer to the indices tensor. Supported data types: U32
544 * @param[in] indices_stride_x Stride of the indices tensor in X dimension (in bytes)
545 * @param[in] indices_step_x indices_stride_x * number of elements along X processed per workitem(in bytes)
546 * @param[in] indices_stride_y Stride of the indices tensor in Y dimension (in bytes)
547 * @param[in] indices_step_y indices_stride_y * number of elements along Y processed per workitem(in bytes)
548 * @param[in] indices_stride_z Stride of the indices tensor in Z dimension (in bytes)
549 * @param[in] indices_step_z indices_stride_z * number of elements along Z processed per workitem(in bytes)
550 * @param[in] indices_offset_first_element_in_bytes The offset of the first element in the indices tensor
551 */
552__kernel void pooling_layer_2_nchw_indices_fp32(
553 TENSOR3D_DECLARATION(input),
554 TENSOR3D_DECLARATION(output),
555 TENSOR3D_DECLARATION(indices))
556{
557 // Get pixels pointer
558 Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
559 Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
560 Tensor3D indices = CONVERT_TO_TENSOR3D_STRUCT(indices);
561
562 // Load data
563 float2 data0 = VLOAD(2)(0, (__global float *)tensor3D_offset(&input, 0, 0, 0));
564 float2 data1 = VLOAD(2)(0, (__global float *)tensor3D_offset(&input, 0, 1, 0));
565
566 // Perform calculations
567 float data0_max = POOL_OP(data0.s0, data0.s1);
568 float data1_max = POOL_OP(data1.s0, data1.s1);
569 float res = POOL_OP(data0_max, data1_max);
570 // Store result
571 *(__global float *)output.ptr = res;
572
573#if defined(PAD_TENSOR_LEFT) && defined(PAD_TENSOR_RIGHT) && defined(PAD_TENSOR_TOP) && defined(PAD_TENSOR_BOTTOM)
574
575 uint offset_top = 0;
576 uint offset_bottom = 0;
577
578 offset_no_padding_nchw(&input, &offset_top, &offset_bottom);
579
580 uint index0 = select(offset_top + 1, offset_top, isgreaterequal(data0.s0, data0.s1));
581 uint index1 = select(offset_bottom + 1, offset_bottom, isgreaterequal(data1.s0, data1.s1));
582 uint index = select(index1, index0, isgreaterequal(data0_max, data1_max));
583
584 *(__global uint *)indices.ptr = index;
585
586#endif //defined(PAD_TENSOR_LEFT) && defined(PAD_TENSOR_RIGHT) && defined(PAD_TENSOR_TOP) && defined(PAD_TENSOR_BOTTOM)
587}
588
589/** Performs a MAX pooling of pool size equal to 2, and record max value indices for NCHW.
590 *
591 * @note Datatype must be passed using -DDATA_TYPE e.g. -DDATA_TYPE=half. Supported data types are F16
592 * @note Pool sizes must be passed using -DPOOL_SIZE_X and -DPOOL_SIZE_Y e.g. -DPOOL_SIZE_X=13;
593 * @note Tensors width and height must be passed at compile time using -DMAX_WIDTH and -DMAX_HEIGHT
594 * @note Pool strides must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y which are the steps of the window along the x and y directions
595 * @note Tensor padding values must be passed at compile time using PAD_TENSOR_LEFT, PAD_TENSOR_RIGHT, PAD_TENSOR_TOP and PAD_TENSOR_BOTTOM
596 *
597 * @param[in] input_ptr Pointer to the source tensor. Supported data types: F16
598 * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
599 * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
600 * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
601 * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
602 * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
603 * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
604 * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor
605 * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
606 * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
607 * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
608 * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
609 * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
610 * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes)
611 * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
612 * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
613 * @param[in] indices_ptr Pointer to the indices tensor. Supported data types: U32
614 * @param[in] indices_stride_x Stride of the indices tensor in X dimension (in bytes)
615 * @param[in] indices_step_x indices_stride_x * number of elements along X processed per workitem(in bytes)
616 * @param[in] indices_stride_y Stride of the indices tensor in Y dimension (in bytes)
617 * @param[in] indices_step_y indices_stride_y * number of elements along Y processed per workitem(in bytes)
618 * @param[in] indices_stride_z Stride of the indices tensor in Z dimension (in bytes)
619 * @param[in] indices_step_z indices_stride_z * number of elements along Z processed per workitem(in bytes)
620 * @param[in] indices_offset_first_element_in_bytes The offset of the first element in the indices tensor
621 */
622__kernel void pooling_layer_2_nchw_indices_fp16(
623 TENSOR3D_DECLARATION(input),
624 TENSOR3D_DECLARATION(output),
625 TENSOR3D_DECLARATION(indices))
626{
627 // Get pixels pointer
628 Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
629 Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
630 Tensor3D indices = CONVERT_TO_TENSOR3D_STRUCT(indices);
631
632 // Load data
633 half2 data0 = VLOAD(2)(0, (__global half *)tensor3D_offset(&input, 0, 0, 0));
634 half2 data1 = VLOAD(2)(0, (__global half *)tensor3D_offset(&input, 0, 1, 0));
635
636 // Perform calculations
637 half data0_max = POOL_OP(data0.s0, data0.s1);
638 half data1_max = POOL_OP(data1.s0, data1.s1);
639 half res = POOL_OP(data0_max, data1_max);
640 // Store result
641 *(__global half *)output.ptr = res;
642
643#if defined(PAD_TENSOR_LEFT) && defined(PAD_TENSOR_RIGHT) && defined(PAD_TENSOR_TOP) && defined(PAD_TENSOR_BOTTOM)
644
645 uint offset_top = 0;
646 uint offset_bottom = 0;
647
648 offset_no_padding_nchw(&input, &offset_top, &offset_bottom);
649
650 uint index0 = select(offset_top + 1, offset_top, isgreaterequal(data0.s0, data0.s1));
651 uint index1 = select(offset_bottom + 1, offset_bottom, isgreaterequal(data1.s0, data1.s1));
652 uint index = select(index1, index0, isgreaterequal(data0_max, data1_max));
653
654 *(__global uint *)indices.ptr = index;
655
656#endif //defined(PAD_TENSOR_LEFT) && defined(PAD_TENSOR_RIGHT) && defined(PAD_TENSOR_TOP) && defined(PAD_TENSOR_BOTTOM)
657}
658
Gian Marco Iodice7333e1f2020-10-08 10:25:49 +0100659#if defined(VEC_SIZE) && defined(VEC_SIZE_LEFTOVER) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(DST_CHANNELS) && defined(DST_HEIGHT) && defined(DST_BATCH_SIZE) && defined(ACC_DATA_TYPE)
660
661#if defined(POOL_SIZE_X) && defined(POOL_SIZE_Y)
662/** Performs pooling layer of size equal to MxN. This OpenCL kernel can perform the following pooling types:
663 * -# max, -DPOOL_MAX must be passed at compile time
664 * -# average, -DPOOL_AVG must be passed at compile time. If padding has to be expluded, -DEXCLUDE_PADDING should be passed at compile time
665 * -# l2 normalisation, -DPOOL_L2 must be passed at compile time
Sheri Zhang801bbcb2020-08-03 20:11:56 +0100666 *
Gian Marco Iodice7333e1f2020-10-08 10:25:49 +0100667 * @note Datatype must be passed at compile type using -DDATA_TYPE e.g. -DDATA_TYPE=half. Supported data types are F32/F16
668 * @note Accumulation data type must be passed at compile time using -DACC_DATA_TYPE e.g. -DACC_DATA_TYPE=float
669 * @note If -DFP_MIXED_PRECISION is passed at compile time, the kernel will use F32 for the partial result
670 * @note Pool size must be passed at compile time using -DPOOL_SIZE_X and -DPOOL_SIZE_Y. e.g. -DPOOL_SIZE_X=4, -DPOOL_SIZE_Y=4
671 * @note Input tensor width and height must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT
672 * @note Output tensor height, channels and batch size must be passed at compile time using -DDST_HEIGHT, -DDST_CHANNELS and -DDST_BATCH_SIZE
Sheri Zhang801bbcb2020-08-03 20:11:56 +0100673 * @note Pool strides must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y which are the steps of the window along the x and y directions
Gian Marco Iodice7333e1f2020-10-08 10:25:49 +0100674 * @note Pool pads must be passed at compile time using -DPAD_X and -DPAD_Y
675 * @note Vector size must be passed at compile time using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
676 * @note Leftover vector size must be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_LEFTOVER=3. It is defined as the remainder between the input's first dimension and VEC_SIZE
677 * @note The initial value for the pooling operation must be passed at compile time using -DINITIAL_VALUE e.g. -DINITIAL_VALUE=0
Sheri Zhang801bbcb2020-08-03 20:11:56 +0100678 *
Gian Marco Iodice7333e1f2020-10-08 10:25:49 +0100679 * @param[in] input_ptr Pointer to the source tensor. Supported data types: F32/F16
680 * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
681 * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
682 * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
683 * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
684 * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
685 * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
686 * @param[in] input_stride_w Stride of the source tensor in W dimension (in bytes)
687 * @param[in] input_step_w input_stride_w * number of elements along W processed per workitem(in bytes)
688 * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor
689 * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
690 * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
691 * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
692 * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
693 * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
694 * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes)
695 * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
696 * @param[in] output_stride_w Stride of the destination tensor in W dimension (in bytes)
697 * @param[in] output_step_w output_stride_w * number of elements along W processed per workitem(in bytes)
698 * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
Sheri Zhang801bbcb2020-08-03 20:11:56 +0100699 */
Gian Marco Iodice7333e1f2020-10-08 10:25:49 +0100700__kernel void pooling_layer_MxN_nhwc(
Sheri Zhang801bbcb2020-08-03 20:11:56 +0100701 TENSOR4D_DECLARATION(input),
Gian Marco Iodice7333e1f2020-10-08 10:25:49 +0100702 TENSOR4D_DECLARATION(output))
Sheri Zhang801bbcb2020-08-03 20:11:56 +0100703{
Gian Marco Iodice7333e1f2020-10-08 10:25:49 +0100704 // Note: If C is not multiple of VEC_SIZE, we shift back of VEC_SIZE_LEFTOVER elements to compute the leftover elements for get_global_id(0) == 0
705 // Note: If C is less than VEC_SIZE, VEC_SIZE should be SHRINKED to the closest smaller VEC_SIZE. This operation is performed on the host side
706 int offset_c = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0) * sizeof(DATA_TYPE);
707 int idx_out_w = get_global_id(1);
708#if DST_BATCH_SIZE != 1
709 // If batch size != 1, the batch size dimension is collapsed over the height dimension
710 int idx_out_h = get_global_id(2) % DST_HEIGHT;
711 int idx_out_n = get_global_id(2) / DST_HEIGHT;
712#else //DST_BATCH_SIZE != 1
713 int idx_out_h = get_global_id(2);
714 int idx_out_n = 0;
715#endif // DST_BATCH_SIZE != 1
Sheri Zhang801bbcb2020-08-03 20:11:56 +0100716
Gian Marco Iodice7333e1f2020-10-08 10:25:49 +0100717 int idx_in_w = idx_out_w * STRIDE_X - PAD_X;
718 int idx_in_h = idx_out_h * STRIDE_Y - PAD_Y;
Sheri Zhang801bbcb2020-08-03 20:11:56 +0100719
Gian Marco Iodice7333e1f2020-10-08 10:25:49 +0100720 int pool_x_s = max((int)0, -idx_in_w);
721 int pool_x_e = min((int)POOL_SIZE_X, (int)SRC_WIDTH - idx_in_w);
722 int pool_y_s = max((int)0, -idx_in_h);
723 int pool_y_e = min((int)POOL_SIZE_Y, (int)SRC_HEIGHT - idx_in_h);
Sheri Zhang801bbcb2020-08-03 20:11:56 +0100724
Gian Marco Iodice7333e1f2020-10-08 10:25:49 +0100725 __global unsigned char *in_base_ptr = input_ptr + input_offset_first_element_in_bytes +
726 offset_c +
727 idx_out_n * input_stride_w;
Sheri Zhang801bbcb2020-08-03 20:11:56 +0100728
Gian Marco Iodice7333e1f2020-10-08 10:25:49 +0100729 __global unsigned char *out_base_ptr = output_ptr + output_offset_first_element_in_bytes +
730 offset_c +
731 idx_out_w * output_stride_y +
732 idx_out_h * output_stride_z +
733 idx_out_n * output_stride_w;
Sheri Zhang801bbcb2020-08-03 20:11:56 +0100734
Gian Marco Iodice7333e1f2020-10-08 10:25:49 +0100735#if ((defined(POOL_AVG) || defined(POOL_L2)))
736#if defined(EXCLUDE_PADDING)
737 int filter_size = 0;
738#else // defined(EXCLUDE_PADDING)
739 int filter_size = POOL_SIZE_X * POOL_SIZE_Y;
740#endif // defined(EXCLUDE_PADDING)
741#endif // ((defined(POOL_AVG) || defined(POOL_L2)))
Sheri Zhang801bbcb2020-08-03 20:11:56 +0100742
Gian Marco Iodice7333e1f2020-10-08 10:25:49 +0100743 VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE)
744 res0 = INITIAL_VALUE;
Sheri Zhang801bbcb2020-08-03 20:11:56 +0100745
Gian Marco Iodice7333e1f2020-10-08 10:25:49 +0100746 for(int y = pool_y_s; y < pool_y_e; ++y)
747 {
748 for(int x = pool_x_s; x < pool_x_e; ++x)
749 {
750 VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE) data0;
751#if defined(FP_MIXED_PRECISION)
752 // In case of FP_MIXED_PRECISION, ACC_DATA_TYPE is != DATA_TYPE
753 data0 = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(in_base_ptr + (x + idx_in_w) * input_stride_y + (y + idx_in_h) * input_stride_z)), VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE));
754#else // defined(FP_MIXED_PRECISION)
755 data0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(in_base_ptr + (x + idx_in_w) * input_stride_y + (y + idx_in_h) * input_stride_z));
756#endif // defined(FP_MIXED_PRECISION)
Sheri Zhang801bbcb2020-08-03 20:11:56 +0100757
Gian Marco Iodice7333e1f2020-10-08 10:25:49 +0100758#if defined(POOL_L2)
759 // Raise to power of 2 for L2 Pooling
760 data0 *= data0;
761#endif // defined(POOL_L2)
762 res0 = POOL_OP(res0, data0);
763
764#if ((defined(POOL_AVG) || defined(POOL_L2))) && defined(EXCLUDE_PADDING)
765 filter_size++;
766#endif // ((defined(POOL_AVG) || defined(POOL_L2))) && defined(EXCLUDE_PADDING)
767 }
768 }
769
770#if defined(POOL_AVG) || defined(POOL_L2)
771 res0 /= (VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE))filter_size;
772#endif // defined(POOL_AVG) || defined(POOL_L2)
773
774#if defined(POOL_L2)
775 // Take square root of the result in L2 pooling
776 res0 = SQRT_OP(res0);
777#endif // defined(POOL_L2)
778
779 // Store result
780#if defined(FP_MIXED_PRECISION)
781 VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) res_converted0 = CONVERT(res0, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE));
782 STORE_VECTOR_SELECT(res_converted, DATA_TYPE, out_base_ptr, VEC_SIZE, VEC_SIZE_LEFTOVER, (VEC_SIZE_LEFTOVER != 0) && get_global_id(0) == 0);
783#else // defined(FP_MIXED_PRECISION)
784 STORE_VECTOR_SELECT(res, DATA_TYPE, out_base_ptr, VEC_SIZE, VEC_SIZE_LEFTOVER, (VEC_SIZE_LEFTOVER != 0) && get_global_id(0) == 0);
785#endif // defined(FP_MIXED_PRECISION)
Sheri Zhang801bbcb2020-08-03 20:11:56 +0100786}
Gian Marco Iodice7333e1f2020-10-08 10:25:49 +0100787#endif // defined(POOL_SIZE_X) && defined(POOL_SIZE_Y)
Sheri Zhang801bbcb2020-08-03 20:11:56 +0100788
Gian Marco Iodice7333e1f2020-10-08 10:25:49 +0100789/** Performs pooling layer of size equal to 2. This OpenCL kernel can perform the following pooling types:
790 * -# max, -DPOOL_MAX must be passed at compile time
791 * -# max extracting the max index, -DPOOL_MAX and -DEXTRACT_MAX_INDEX must be passed at compile time
792 * -# average, -DPOOL_AVG must be passed at compile time. If padding has to be expluded, -DEXCLUDE_PADDING should be passed at compile time
793 * -# l2 normalisation, -DPOOL_L2 must be passed at compile time
Sheri Zhang801bbcb2020-08-03 20:11:56 +0100794 *
Gian Marco Iodice7333e1f2020-10-08 10:25:49 +0100795 * @note Datatype must be passed at compile type using -DDATA_TYPE e.g. -DDATA_TYPE=half. Supported data types are F32/F16
796 * @note Accumulation data type must be passed at compile time using -DACC_DATA_TYPE e.g. -DACC_DATA_TYPE=float
797 * @note If -DFP_MIXED_PRECISION is passed at compile time, the kernel will use F32 for the partial result
798 * @note Input tensor width and height must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT
799 * @note Output tensor height, channels and batch size must be passed at compile time using -DDST_HEIGHT, -DDST_CHANNELS and -DDST_BATCH_SIZE
Sheri Zhang801bbcb2020-08-03 20:11:56 +0100800 * @note Pool strides must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y which are the steps of the window along the x and y directions
Gian Marco Iodice7333e1f2020-10-08 10:25:49 +0100801 * @note Pool pads must be passed at compile time using -DPAD_X and -DPAD_Y
802 * @note Vector size must be passed at compile time using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
803 * @note Leftover vector size must be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_LEFTOVER=3. It is defined as the remainder between the input's first dimension and VEC_SIZE
804 * @note The initial value for the pooling operation must be passed at compile time using -DINITIAL_VALUE e.g. -DINITIAL_VALUE=0
Sheri Zhang801bbcb2020-08-03 20:11:56 +0100805 *
Gian Marco Iodice7333e1f2020-10-08 10:25:49 +0100806 * @param[in] input_ptr Pointer to the source tensor. Supported data types: F32/F16
Sheri Zhang801bbcb2020-08-03 20:11:56 +0100807 * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
808 * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
809 * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
810 * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
811 * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
812 * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
813 * @param[in] input_stride_w Stride of the source tensor in W dimension (in bytes)
814 * @param[in] input_step_w input_stride_w * number of elements along W processed per workitem(in bytes)
815 * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor
816 * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
817 * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
818 * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
819 * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
820 * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
821 * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes)
822 * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
823 * @param[in] output_stride_w Stride of the destination tensor in W dimension (in bytes)
824 * @param[in] output_step_w output_stride_w * number of elements along W processed per workitem(in bytes)
825 * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
Gian Marco Iodice7333e1f2020-10-08 10:25:49 +0100826 * @param[in] indices_ptr (Optional) Pointer to the indices tensor. Supported data types: U32
827 * @param[in] indices_stride_x (Optional) Stride of the indices tensor in X dimension (in bytes)
828 * @param[in] indices_step_x (Optional) indices_stride_x * number of elements along X processed per workitem(in bytes)
829 * @param[in] indices_stride_y (Optional) Stride of the indices tensor in Y dimension (in bytes)
830 * @param[in] indices_step_y (Optional) indices_stride_y * number of elements along Y processed per workitem(in bytes)
831 * @param[in] indices_stride_z (Optional) Stride of the indices tensor in Z dimension (in bytes)
832 * @param[in] indices_step_z (Optional) indices_stride_z * number of elements along Z processed per workitem(in bytes)
833 * @param[in] indices_stride_w (Optional) Stride of the indices tensor in W dimension (in bytes)
834 * @param[in] indices_step_w (Optional) indices_stride_w * number of elements along W processed per workitem(in bytes)
835 * @param[in] indices_offset_first_element_in_bytes (Optional) The offset of the first element in the indices tensor
Sheri Zhang801bbcb2020-08-03 20:11:56 +0100836 */
Gian Marco Iodice7333e1f2020-10-08 10:25:49 +0100837__kernel void pooling_layer_2x2_nhwc(
Sheri Zhang801bbcb2020-08-03 20:11:56 +0100838 TENSOR4D_DECLARATION(input),
Gian Marco Iodice7333e1f2020-10-08 10:25:49 +0100839 TENSOR4D_DECLARATION(output)
840#if defined(EXTRACT_MAX_INDEX) && defined(POOL_MAX)
841 ,
842 TENSOR4D_DECLARATION(indices)
843#endif // defined(EXTRACT_MAX_INDEX) && defined(POOL_MAX)
844)
Sheri Zhang801bbcb2020-08-03 20:11:56 +0100845{
Gian Marco Iodice7333e1f2020-10-08 10:25:49 +0100846 // Note: If C is not multiple of VEC_SIZE, we shift back of VEC_SIZE_LEFTOVER elements to compute the leftover elements for get_global_id(0) == 0
847 // Note: If C is less than VEC_SIZE, VEC_SIZE should be SHRINKED to the closest smaller VEC_SIZE. This operation is performed on the host side
848 int idx_out_c = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0);
849 int idx_out_w = get_global_id(1);
850#if DST_BATCH_SIZE != 1
851 // If batch size != 1, the batch size dimension is collapsed over the height dimension
852 int idx_out_h = get_global_id(2) % DST_HEIGHT;
853 int idx_out_n = get_global_id(2) / DST_HEIGHT;
854#else //SRC_BATCH_SIZE != 1
855 int idx_out_h = get_global_id(2);
856 int idx_out_n = 0;
857#endif // SRC_BATCH_SIZE != 1
Sheri Zhang801bbcb2020-08-03 20:11:56 +0100858
Gian Marco Iodice7333e1f2020-10-08 10:25:49 +0100859 int idx_in_w = idx_out_w * STRIDE_X - PAD_X;
860 int idx_in_h = idx_out_h * STRIDE_Y - PAD_Y;
Sheri Zhang801bbcb2020-08-03 20:11:56 +0100861
Gian Marco Iodice7333e1f2020-10-08 10:25:49 +0100862 __global unsigned char *in_base_ptr = input_ptr + input_offset_first_element_in_bytes +
863 idx_out_c * sizeof(DATA_TYPE) +
864 idx_out_n * input_stride_w;
Sheri Zhang801bbcb2020-08-03 20:11:56 +0100865
Gian Marco Iodice7333e1f2020-10-08 10:25:49 +0100866 __global unsigned char *out_base_ptr = output_ptr + output_offset_first_element_in_bytes +
867 idx_out_c * sizeof(DATA_TYPE) +
868 idx_out_w * output_stride_y +
869 idx_out_h * output_stride_z +
870 idx_out_n * output_stride_w;
Sheri Zhang801bbcb2020-08-03 20:11:56 +0100871
Gian Marco Iodice7333e1f2020-10-08 10:25:49 +0100872 int pool_x_s = max((int)0, -idx_in_w);
873 int pool_x_e = min((int)2, (int)SRC_WIDTH - idx_in_w);
874 int pool_y_s = max((int)0, -idx_in_h);
875 int pool_y_e = min((int)2, (int)SRC_HEIGHT - idx_in_h);
Sheri Zhang801bbcb2020-08-03 20:11:56 +0100876
Gian Marco Iodice7333e1f2020-10-08 10:25:49 +0100877 int filter_size = (pool_x_e - pool_x_s) * (pool_y_e - pool_y_s);
Sheri Zhang801bbcb2020-08-03 20:11:56 +0100878
Gian Marco Iodice7333e1f2020-10-08 10:25:49 +0100879 int x0 = pool_x_s + idx_in_w;
880 int y0 = pool_y_s + idx_in_h;
881 int x1 = pool_x_e - 1 + idx_in_w;
882 int y1 = pool_y_e - 1 + idx_in_h;
Sheri Zhang801bbcb2020-08-03 20:11:56 +0100883
Gian Marco Iodice7333e1f2020-10-08 10:25:49 +0100884 REPEAT_VAR_INIT_TO_CONST(4, VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE), data, 0);
Sheri Zhang801bbcb2020-08-03 20:11:56 +0100885
Gian Marco Iodice7333e1f2020-10-08 10:25:49 +0100886#if defined(FP_MIXED_PRECISION)
887 // In case of FP_MIXED_PRECISION, ACC_DATA_TYPE is != DATA_TYPE
888 data0 = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(in_base_ptr + x0 * input_stride_y + y0 * input_stride_z)), VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE));
889 data1 = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(in_base_ptr + x1 * input_stride_y + y0 * input_stride_z)), VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE));
890 data2 = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(in_base_ptr + x0 * input_stride_y + y1 * input_stride_z)), VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE));
891 data3 = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(in_base_ptr + x1 * input_stride_y + y1 * input_stride_z)), VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE));
892#else // defined(FP_MIXED_PRECISION)
893 data0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(in_base_ptr + x0 * input_stride_y + y0 * input_stride_z));
894 data1 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(in_base_ptr + x1 * input_stride_y + y0 * input_stride_z));
895 data2 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(in_base_ptr + x0 * input_stride_y + y1 * input_stride_z));
896 data3 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(in_base_ptr + x1 * input_stride_y + y1 * input_stride_z));
897#endif // defined(FP_MIXED_PRECISION)
Sheri Zhang801bbcb2020-08-03 20:11:56 +0100898
Gian Marco Iodice7333e1f2020-10-08 10:25:49 +0100899#if !defined(POOL_MAX)
900 if(filter_size != 4)
901 {
902 // Make invalid the values loaded if the x or y coordinate was clamped (out-of-bound)
903 data1 = select(data1, (VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE))INITIAL_VALUE, (SELECT_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE))(pool_x_e == pool_x_s));
904 data2 = select(data2, (VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE))INITIAL_VALUE, (SELECT_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE))(pool_y_e == pool_y_s));
905 data3 = select(data3, (VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE))INITIAL_VALUE, (SELECT_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE))((pool_x_e == pool_x_s) || (pool_y_e == pool_y_s)));
906 }
907#endif // !defined(POOL_MAX)
908
909#if defined(POOL_L2)
910 // Raise to power of 2 for L2 Pooling
911 data0 *= data0;
912 data1 *= data1;
913 data2 *= data2;
914 data3 *= data3;
915#endif /* defined(POOL_L2) */
916
917 VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE)
918 res0 = data0;
919 res0 = POOL_OP(res0, data1);
920 res0 = POOL_OP(res0, data2);
921 res0 = POOL_OP(res0, data3);
922
923#if defined(POOL_AVG) || defined(POOL_L2)
924#if defined(EXCLUDE_PADDING)
925 res0 /= (VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE))filter_size;
926#else // !defined(EXCLUDE_PADDING)
927 res0 /= (VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE))4;
928#endif // defined(EXCLUDE_PADDING)
929#endif // defined(POOL_AVG) || defined(POOL_L2)
930
931#if defined(POOL_L2)
932 // Take square root of the result in L2 pooling
933 res0 = SQRT_OP(res0);
934#endif // defined(POOL_L2)
935
936 // Store result
937#if defined(FP_MIXED_PRECISION)
938 VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) res_converted0 = CONVERT(res0, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE));
939 STORE_VECTOR_SELECT(res_converted, DATA_TYPE, out_base_ptr, VEC_SIZE, VEC_SIZE_LEFTOVER, (VEC_SIZE_LEFTOVER != 0) && get_global_id(0) == 0);
940#else // defined(FP_MIXED_PRECISION)
941 STORE_VECTOR_SELECT(res, DATA_TYPE, out_base_ptr, VEC_SIZE, VEC_SIZE_LEFTOVER, (VEC_SIZE_LEFTOVER != 0) && get_global_id(0) == 0);
942#endif // defined(FP_MIXED_PRECISION)
943
944#if defined(EXTRACT_MAX_INDEX) && defined(POOL_MAX)
945
946 // This part is used to return the index of the maximum value
947 // Note: DST_CHANNELS and DST_BATCH_SIZE can be used for either the input and output tensor
948
949 // note: Batch dimension does not contribute in the offset contribution
950 VEC_DATA_TYPE(uint, VEC_SIZE) base_index = (uint)idx_out_c;
951
952 base_index += VEC_OFFS(VEC_DATA_TYPE(uint, VEC_SIZE), VEC_SIZE);
953
954 VEC_DATA_TYPE(uint, VEC_SIZE) index0 = base_index + (uint)x0 * DST_CHANNELS + (uint)y0 * (DST_CHANNELS * SRC_WIDTH);
955 VEC_DATA_TYPE(uint, VEC_SIZE) index1 = base_index + (uint)x1 * DST_CHANNELS + (uint)y0 * (DST_CHANNELS * SRC_WIDTH);
956 VEC_DATA_TYPE(uint, VEC_SIZE) index2 = base_index + (uint)x0 * DST_CHANNELS + (uint)y1 * (DST_CHANNELS * SRC_WIDTH);
957 VEC_DATA_TYPE(uint, VEC_SIZE) index3 = base_index + (uint)x1 * DST_CHANNELS + (uint)y1 * (DST_CHANNELS * SRC_WIDTH);
958
959 index0 = select(index1, index0, CONVERT(isgreaterequal(data0, data1), VEC_DATA_TYPE(int, VEC_SIZE)));
960 index1 = select(index3, index2, CONVERT(isgreaterequal(data2, data3), VEC_DATA_TYPE(int, VEC_SIZE)));
961 index0 = select(index1, index0, CONVERT(isgreaterequal(max(data0, data1), max(data2, data3)), VEC_DATA_TYPE(int, VEC_SIZE)));
962
963 __global unsigned char *idx_base_ptr = indices_ptr + indices_offset_first_element_in_bytes +
964 idx_out_c * sizeof(uint) +
965 idx_out_w * indices_stride_y +
966 idx_out_h * indices_stride_z +
967 idx_out_n * indices_stride_w;
968
969 // Store result
970 STORE_VECTOR_SELECT(index, uint, idx_base_ptr, VEC_SIZE, VEC_SIZE_LEFTOVER, ((VEC_SIZE_LEFTOVER != 0) && get_global_id(0) == 0));
971#endif // defined(EXTRACT_MAX_INDEX) && defined(POOL_MAX)
972}
973#endif // defined(VEC_SIZE) && defined(VEC_SIZE_LEFTOVER) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(DST_CHANNELS) && defined(DST_HEIGHT) && defined(DST_BATCH_SIZE) && defined(SELECT_DATA_TYPE) && defined(ACC_DATA_TYPE)