blob: c38a78ce3e9b632608805b54c518109b8983cba9 [file] [log] [blame]
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001/*
Isabella Gottardia527e8c2018-01-31 17:49:25 +00002 * Copyright (c) 2017-2018 ARM Limited.
Anthony Barbier6ff3b192017-09-04 18:44:23 +01003 *
4 * SPDX-License-Identifier: MIT
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to
8 * deal in the Software without restriction, including without limitation the
9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10 * sell copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in all
14 * copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24#include "helpers.h"
25
Georgios Pinitascdf51452017-08-31 14:21:36 +010026#if defined(POOL_AVG) || defined(POOL_L2)
Anthony Barbier6ff3b192017-09-04 18:44:23 +010027#define POOL_OP(x, y) ((x) + (y))
Georgios Pinitascdf51452017-08-31 14:21:36 +010028#else /* defined(POOL_AVG) || defined(POOL_L2) */
Anthony Barbier6ff3b192017-09-04 18:44:23 +010029#define POOL_OP(x, y) (fmax((x), (y)))
Georgios Pinitascdf51452017-08-31 14:21:36 +010030#endif /* defined(POOL_AVG) || defined(POOL_L2) */
31
32#if defined(POOL_L2)
33#define POW2_OP(x, vec_size) ((x) * (x))
34#else /* defined(POOL_L2) */
35#define POW2_OP(x, vec_size) (x)
36#endif /* defined(POOL_L2) */
Anthony Barbier6ff3b192017-09-04 18:44:23 +010037
steniu010c7614f2017-06-23 17:00:26 +010038#define DIV_OP(x, y) (x * (1.f / y))
Georgios Pinitascdf51452017-08-31 14:21:36 +010039#define SQRT_OP(x) sqrt((x))
steniu010c7614f2017-06-23 17:00:26 +010040
Michalis Spyroue74b2012018-04-18 09:49:16 +010041#define DIV_OP_NHWC(x, y) (x * (VEC_DATA_TYPE(DATA_TYPE, 8))(1.f / y))
42
Gian Marco Iodicecb292832017-08-02 13:19:48 +010043#if STRIDE_X == 1
44#define POOLING3x3(res, input, output) POOLING3x3_STRIDE1(res, input, output)
45#elif STRIDE_X == 2 /* STRIDE_X == 1 */
46#define POOLING3x3(res, input, output) POOLING3x3_STRIDE2(res, input, output)
47#elif STRIDE_X == 3 /* STRIDE_X not equals 1 or 2 */
48#define POOLING3x3(res, input, output) POOLING3x3_STRIDE3(res, input, output)
49#endif /* STRIDE_X == 3 */
50
Gian Marco Iodicecb292832017-08-02 13:19:48 +010051#define POOLING3x3_STRIDE1(res, input, output) \
52 ({ \
53 VEC_DATA_TYPE(DATA_TYPE, 4) \
54 data00 = vload4(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 0, 0)); \
55 VEC_DATA_TYPE(DATA_TYPE, 2) \
56 data01 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 0, 0) + 4); \
57 VEC_DATA_TYPE(DATA_TYPE, 4) \
58 data10 = vload4(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 1, 0)); \
59 VEC_DATA_TYPE(DATA_TYPE, 2) \
60 data11 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 1, 0) + 4); \
61 VEC_DATA_TYPE(DATA_TYPE, 4) \
62 data20 = vload4(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 2, 0)); \
63 VEC_DATA_TYPE(DATA_TYPE, 2) \
64 data21 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 2, 0) + 4); \
Georgios Pinitascdf51452017-08-31 14:21:36 +010065 data00 = POW2_OP(data00, 4); \
66 data01 = POW2_OP(data01, 2); \
67 data10 = POW2_OP(data10, 4); \
68 data11 = POW2_OP(data11, 2); \
69 data20 = POW2_OP(data20, 4); \
70 data21 = POW2_OP(data21, 2); \
Gian Marco Iodicecb292832017-08-02 13:19:48 +010071 \
72 VEC_DATA_TYPE(DATA_TYPE, 8) \
73 values00 = (VEC_DATA_TYPE(DATA_TYPE, 8))(data00.s01212323); \
74 VEC_DATA_TYPE(DATA_TYPE, 4) \
75 values01 = (VEC_DATA_TYPE(DATA_TYPE, 4))(data01.s0, data00.s3, data01.s01); \
76 VEC_DATA_TYPE(DATA_TYPE, 8) \
77 values10 = (VEC_DATA_TYPE(DATA_TYPE, 8))(data10.s01212323); \
78 VEC_DATA_TYPE(DATA_TYPE, 4) \
79 values11 = (VEC_DATA_TYPE(DATA_TYPE, 4))(data11.s0, data10.s3, data11.s01); \
80 VEC_DATA_TYPE(DATA_TYPE, 8) \
81 values20 = (VEC_DATA_TYPE(DATA_TYPE, 8))(data20.s01212323); \
82 VEC_DATA_TYPE(DATA_TYPE, 4) \
83 values21 = (VEC_DATA_TYPE(DATA_TYPE, 4))(data21.s0, data20.s3, data21.s01); \
84 \
85 values00 = POOL_OP(values00, values10); \
86 values01 = POOL_OP(values01, values11); \
87 values00 = POOL_OP(values00, values20); \
88 values01 = POOL_OP(values01, values21); \
89 \
90 res = POOL_OP((VEC_DATA_TYPE(DATA_TYPE, 4))(values00.s036, values01.s1), (VEC_DATA_TYPE(DATA_TYPE, 4))(values00.s147, values01.s2)); \
91 res = POOL_OP(res, (VEC_DATA_TYPE(DATA_TYPE, 4))(values00.s25, values01.s03)); \
92 })
93
94#define POOLING3x3_STRIDE2(res, input, output) \
95 ({ \
96 VEC_DATA_TYPE(DATA_TYPE, 8) \
97 data00 = vload8(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 0, 0)); \
98 DATA_TYPE data01 = *((__global DATA_TYPE *)tensor3D_offset(&input, 0, 0, 0) + 8); \
99 VEC_DATA_TYPE(DATA_TYPE, 8) \
100 data10 = vload8(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 1, 0)); \
101 DATA_TYPE data11 = *((__global DATA_TYPE *)tensor3D_offset(&input, 0, 1, 0) + 8); \
102 VEC_DATA_TYPE(DATA_TYPE, 8) \
103 data20 = vload8(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 2, 0)); \
104 DATA_TYPE data21 = *((__global DATA_TYPE *)tensor3D_offset(&input, 0, 2, 0) + 8); \
Georgios Pinitascdf51452017-08-31 14:21:36 +0100105 data00 = POW2_OP(data00, 8); \
106 data01 = POW2_OP(data01, 1); \
107 data10 = POW2_OP(data10, 8); \
108 data11 = POW2_OP(data11, 1); \
109 data20 = POW2_OP(data20, 8); \
110 data21 = POW2_OP(data21, 1); \
Gian Marco Iodicecb292832017-08-02 13:19:48 +0100111 \
112 VEC_DATA_TYPE(DATA_TYPE, 8) \
113 values00 = (VEC_DATA_TYPE(DATA_TYPE, 8))(data00.s01223445); \
114 VEC_DATA_TYPE(DATA_TYPE, 4) \
115 values01 = (VEC_DATA_TYPE(DATA_TYPE, 4))(data00.s667, data01); \
116 VEC_DATA_TYPE(DATA_TYPE, 8) \
117 values10 = (VEC_DATA_TYPE(DATA_TYPE, 8))(data10.s01223445); \
118 VEC_DATA_TYPE(DATA_TYPE, 4) \
119 values11 = (VEC_DATA_TYPE(DATA_TYPE, 4))(data10.s667, data11); \
120 VEC_DATA_TYPE(DATA_TYPE, 8) \
121 values20 = (VEC_DATA_TYPE(DATA_TYPE, 8))(data20.s01223445); \
122 VEC_DATA_TYPE(DATA_TYPE, 4) \
123 values21 = (VEC_DATA_TYPE(DATA_TYPE, 4))(data20.s667, data21); \
124 \
125 values00 = POOL_OP(values00, values10); \
126 values01 = POOL_OP(values01, values11); \
127 values00 = POOL_OP(values00, values20); \
128 values01 = POOL_OP(values01, values21); \
129 \
130 res = POOL_OP((VEC_DATA_TYPE(DATA_TYPE, 4))(values00.s036, values01.s1), (VEC_DATA_TYPE(DATA_TYPE, 4))(values00.s147, values01.s2)); \
131 res = POOL_OP(res, (VEC_DATA_TYPE(DATA_TYPE, 4))(values00.s25, values01.s03)); \
132 })
133
134#define POOLING3x3_STRIDE3(res, input, output) \
135 ({ \
136 VEC_DATA_TYPE(DATA_TYPE, 8) \
137 data00 = vload8(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 0, 0)); \
138 VEC_DATA_TYPE(DATA_TYPE, 4) \
139 data01 = vload4(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 0, 0) + 8); \
140 VEC_DATA_TYPE(DATA_TYPE, 8) \
141 data10 = vload8(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 1, 0)); \
142 VEC_DATA_TYPE(DATA_TYPE, 4) \
143 data11 = vload4(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 1, 0) + 8); \
144 VEC_DATA_TYPE(DATA_TYPE, 8) \
145 data20 = vload8(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 2, 0)); \
146 VEC_DATA_TYPE(DATA_TYPE, 4) \
147 data21 = vload4(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 2, 0) + 8); \
Georgios Pinitascdf51452017-08-31 14:21:36 +0100148 data00 = POW2_OP(data00, 8); \
149 data01 = POW2_OP(data01, 4); \
150 data10 = POW2_OP(data10, 8); \
151 data11 = POW2_OP(data11, 4); \
152 data20 = POW2_OP(data20, 8); \
153 data21 = POW2_OP(data21, 4); \
Gian Marco Iodicecb292832017-08-02 13:19:48 +0100154 \
155 data00 = POOL_OP(data00, data10); \
156 data01 = POOL_OP(data01, data11); \
157 data00 = POOL_OP(data00, data20); \
158 data01 = POOL_OP(data01, data21); \
159 \
160 res = POOL_OP((VEC_DATA_TYPE(DATA_TYPE, 4))(data00.s036, data01.s1), (VEC_DATA_TYPE(DATA_TYPE, 4))(data00.s147, data01.s2)); \
161 res = POOL_OP(res, (VEC_DATA_TYPE(DATA_TYPE, 4))(data00.s25, data01.s03)); \
162 })
163
Isabella Gottardia527e8c2018-01-31 17:49:25 +0000164DATA_TYPE calculate_avg_scale(const int pool_size_x, const int pool_size_y, const int upper_bound_w, const int upper_bound_h,
Gian Marco Iodicecb292832017-08-02 13:19:48 +0100165 const int pad_x, const int pad_y, const int stride_x, const int stride_y)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100166{
Georgios Pinitasadaae7e2017-10-30 15:56:32 +0000167 int start_x = get_global_id(0) * stride_x - pad_x;
168 int start_y = get_global_id(1) * stride_y - pad_y;
Isabella Gottardia527e8c2018-01-31 17:49:25 +0000169 const int end_x = min(start_x + pool_size_x, upper_bound_w);
170 const int end_y = min(start_y + pool_size_y, upper_bound_h);
Georgios Pinitasadaae7e2017-10-30 15:56:32 +0000171#if defined(EXCLUDE_PADDING)
172 start_x = max(0, start_x);
173 start_y = max(0, start_y);
174#endif /* defined(EXCLUDE_PADDING) */
steniu010c7614f2017-06-23 17:00:26 +0100175 return ((end_y - start_y) * (end_x - start_x));
Gian Marco Iodicecb292832017-08-02 13:19:48 +0100176}
177
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100178/** Performs a pooling function of pool size equal to 2.
179 *
Vidhya Sudhan Loganathan7485d5a2018-07-04 09:34:00 +0100180 * @note Datatype must be passed using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types are F16/F32;
Gian Marco Iodicecb292832017-08-02 13:19:48 +0100181 * @note In case of average pooling the following information must be passed at compile time:
Georgios Pinitascdf51452017-08-31 14:21:36 +0100182 * -DPOOL_AVG or -DPOOL_L2 must be provided otherwise max pooling will be performed.
Gian Marco Iodicecb292832017-08-02 13:19:48 +0100183 * -DMAX_WIDTH and -DMAX_HEIGHT which are the maximum accessible indeces in x and y dimensions (width + pad)
184 * -DSTRIDE_X and -DSTRIDE_Y which are the steps of the window along the x and y directions
185 * -DPAD_X and -DPAD_Y which are the pooling paddings in x and y dimension
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100186 *
Vidhya Sudhan Loganathan7485d5a2018-07-04 09:34:00 +0100187 * @param[in] input_ptr Pointer to the source image. Supported data types: F16/F32
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100188 * @param[in] input_stride_x Stride of the source image in X dimension (in bytes)
189 * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
190 * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes)
191 * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
192 * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
193 * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
194 * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image
Gian Marco Iodicecb292832017-08-02 13:19:48 +0100195 * @param[out] output_ptr Pointer to the destination image. Supported data types: same as @p input_ptr
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100196 * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes)
197 * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
198 * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes)
199 * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
200 * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes)
201 * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
202 * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100203 */
204__kernel void pooling_layer_2(
205 TENSOR3D_DECLARATION(input),
Gian Marco Iodicecb292832017-08-02 13:19:48 +0100206 TENSOR3D_DECLARATION(output))
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100207{
208 // Get pixels pointer
209 Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
210 Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
211
212 // Load data
213 VEC_DATA_TYPE(DATA_TYPE, 2)
214 data0 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 0, 0));
215 VEC_DATA_TYPE(DATA_TYPE, 2)
216 data1 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 1, 0));
217
Georgios Pinitascdf51452017-08-31 14:21:36 +0100218#if defined(POOL_L2)
219 // Raise to power of 2 for L2 Pooling
220 data0 = POW2_OP(data0, 2);
221 data1 = POW2_OP(data1, 2);
222#endif /* defined(POOL_L2) */
223
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100224 // Perform calculations
225 data0 = POOL_OP(data0, data1);
226 DATA_TYPE res = POOL_OP(data0.s0, data0.s1);
227
Georgios Pinitascdf51452017-08-31 14:21:36 +0100228#if defined(POOL_AVG) || defined(POOL_L2)
229 // Divide by pool region in case of average or l2 pooling
Isabella Gottardia527e8c2018-01-31 17:49:25 +0000230 res = DIV_OP(res, calculate_avg_scale(2, 2, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y));
Georgios Pinitascdf51452017-08-31 14:21:36 +0100231#endif /* defined(POOL_AVG) || defined(POOL_L2) */
232
233#if defined(POOL_L2)
234 // Take square root of the result in L2 pooling
235 res = SQRT_OP(res);
236#endif /* defined(POOL_L2) */
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100237
238 // Store result
239 *(__global DATA_TYPE *)output.ptr = res;
240}
241
Gian Marco Iodicecb292832017-08-02 13:19:48 +0100242/** Performs a pooling function of pool size equal to 3
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100243 *
Vidhya Sudhan Loganathan7485d5a2018-07-04 09:34:00 +0100244 * @note Datatype must be passed using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types are F16/F32;
Gian Marco Iodicecb292832017-08-02 13:19:48 +0100245 * @note In case of average pooling the following information must be passed at compile time:
Georgios Pinitascdf51452017-08-31 14:21:36 +0100246 * -DPOOL_AVG or -DPOOL_L2 must be provided otherwise max pooling will be performed.
Gian Marco Iodicecb292832017-08-02 13:19:48 +0100247 * -DMAX_WIDTH and -DMAX_HEIGHT which are the maximum accessible indeces in x and y dimensions (width + pad)
248 * -DSTRIDE_X and -DSTRIDE_Y which are the steps of the window along the x and y directions
249 * -DPAD_X and -DPAD_Y which are the pooling paddings in x and y dimension
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100250 *
Vidhya Sudhan Loganathan7485d5a2018-07-04 09:34:00 +0100251 * @param[in] input_ptr Pointer to the source image. Supported data types: F16/F32
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100252 * @param[in] input_stride_x Stride of the source image in X dimension (in bytes)
253 * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
254 * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes)
255 * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
256 * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
257 * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
258 * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image
Gian Marco Iodicecb292832017-08-02 13:19:48 +0100259 * @param[out] output_ptr Pointer to the destination image. Supported data types: same as @p input_ptr
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100260 * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes)
261 * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
262 * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes)
263 * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
264 * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes)
265 * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
266 * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100267 */
268__kernel void pooling_layer_3(
269 TENSOR3D_DECLARATION(input),
Gian Marco Iodicecb292832017-08-02 13:19:48 +0100270 TENSOR3D_DECLARATION(output))
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100271{
272 // Get pixels pointer
273 Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
274 Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
275
276 // Load data
277 VEC_DATA_TYPE(DATA_TYPE, 3)
278 data0 = vload3(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 0, 0));
279 VEC_DATA_TYPE(DATA_TYPE, 3)
280 data1 = vload3(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 1, 0));
281 VEC_DATA_TYPE(DATA_TYPE, 3)
282 data2 = vload3(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 2, 0));
283
Georgios Pinitascdf51452017-08-31 14:21:36 +0100284#if defined(POOL_L2)
285 // Raise to power of 2 for L2 Pooling
286 data0 = POW2_OP(data0, 3);
287 data1 = POW2_OP(data1, 3);
288 data2 = POW2_OP(data2, 3);
289#endif /* defined(POOL_L2) */
290
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100291 // Perform calculations
292 data0 = POOL_OP(data0, data1);
293 data0 = POOL_OP(data0, data2);
294 DATA_TYPE res = POOL_OP(POOL_OP(data0.s0, data0.s1), data0.s2);
295
Georgios Pinitascdf51452017-08-31 14:21:36 +0100296#if defined(POOL_AVG) || defined(POOL_L2)
Georgios Pinitasce093142017-06-19 16:11:53 +0100297 // Divide by pool region in case of average pooling
Isabella Gottardia527e8c2018-01-31 17:49:25 +0000298 res = DIV_OP(res, calculate_avg_scale(3, 3, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y));
Georgios Pinitascdf51452017-08-31 14:21:36 +0100299#endif /* defined(POOL_AVG) || defined(POOL_L2) */
300
301#if defined(POOL_L2)
302 // Take square root of the result in L2 pooling
303 res = SQRT_OP(res);
304#endif /* defined(POOL_L2) */
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100305
306 // Store result
307 *(__global DATA_TYPE *)output.ptr = res;
308}
Georgios Pinitasce093142017-06-19 16:11:53 +0100309
Vidhya Sudhan Loganathan7485d5a2018-07-04 09:34:00 +0100310#if defined(POOLING3x3)
steniu010c7614f2017-06-23 17:00:26 +0100311
312#define CONVERT_OP(data_type) convert_##data_type##4
313#define CONVERT_VECTOR4(data_type) CONVERT_OP(data_type)
314
315VEC_DATA_TYPE(DATA_TYPE, 4)
316calculate_avg_scale4(const int pool_size, const int upper_bound_w, const int upper_bound_h,
317 const int pad_x, const int pad_y, const int stride_x, const int stride_y)
318{
Georgios Pinitasadaae7e2017-10-30 15:56:32 +0000319 int4 start_x = ((int4)get_global_id(0) * 4 + (int4)(0, 1, 2, 3)) * (int4)stride_x - (int4)pad_x;
320 int start_y = get_global_id(1) * stride_y - pad_y;
steniu010c7614f2017-06-23 17:00:26 +0100321 const int4 end_x = min(start_x + (int4)pool_size, (int4)upper_bound_w);
322 const int end_y = min(start_y + pool_size, upper_bound_h);
Georgios Pinitasadaae7e2017-10-30 15:56:32 +0000323#if defined(EXCLUDE_PADDING)
324 start_x = max((int4)0, start_x);
325 start_y = max(0, start_y);
326#endif /* defined(EXCLUDE_PADDING) */
steniu010c7614f2017-06-23 17:00:26 +0100327 return (VEC_DATA_TYPE(DATA_TYPE, 4))(1.f) / CONVERT_VECTOR4(DATA_TYPE)(((int4)(end_y - start_y)) * (end_x - start_x));
328}
329
Gian Marco Iodicecb292832017-08-02 13:19:48 +0100330/** Performs an optimized pooling function of pool size equal to 3 when the stride_x is less equal than 3
Georgios Pinitasce093142017-06-19 16:11:53 +0100331 *
Vidhya Sudhan Loganathan7485d5a2018-07-04 09:34:00 +0100332 * @note Datatype must be passed using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types are F16/F32;
Gian Marco Iodicecb292832017-08-02 13:19:48 +0100333 * @note In case of average pooling the following information must be passed at compile time:
Georgios Pinitascdf51452017-08-31 14:21:36 +0100334 * -DPOOL_AVG or -DPOOL_L2 must be provided otherwise max pooling will be performed.
Gian Marco Iodicecb292832017-08-02 13:19:48 +0100335 * -DMAX_WIDTH and -DMAX_HEIGHT which are the maximum accessible indeces in x and y dimensions (width + pad)
336 * -DSTRIDE_X and -DSTRIDE_Y which are the steps of the window along the x and y directions
337 * -DPAD_X and -DPAD_Y which are the pooling paddings in x and y dimension
Georgios Pinitasce093142017-06-19 16:11:53 +0100338 *
Gian Marco Iodicecb292832017-08-02 13:19:48 +0100339 * @param[in] input_ptr Pointer to the source image. Supported data types: F16/F32
Georgios Pinitasce093142017-06-19 16:11:53 +0100340 * @param[in] input_stride_x Stride of the source image in X dimension (in bytes)
341 * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
342 * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes)
343 * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
344 * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
345 * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
346 * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image
Gian Marco Iodicecb292832017-08-02 13:19:48 +0100347 * @param[out] output_ptr Pointer to the destination image. Supported data types: same as @p input_ptr
Georgios Pinitasce093142017-06-19 16:11:53 +0100348 * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes)
349 * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
350 * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes)
351 * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
352 * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes)
353 * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
354 * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image
Gian Marco Iodicecb292832017-08-02 13:19:48 +0100355 */
Anton Lokhmotovaf6204c2017-11-08 09:34:19 +0000356__kernel void pooling_layer_optimized_3(
Gian Marco Iodicecb292832017-08-02 13:19:48 +0100357 TENSOR3D_DECLARATION(input),
358 TENSOR3D_DECLARATION(output))
359{
360 // Get pixels pointer
361 Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
362 Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
363
364 VEC_DATA_TYPE(DATA_TYPE, 4)
365 res;
366
367 // Perform pooling 3x3 for 4 output elements
368 POOLING3x3(res, input, output);
369
Georgios Pinitascdf51452017-08-31 14:21:36 +0100370#if defined(POOL_AVG) || defined(POOL_L2)
Gian Marco Iodicecb292832017-08-02 13:19:48 +0100371 // Divide by pool region in case of average pooling
Gian Marco Iodicecb292832017-08-02 13:19:48 +0100372 res *= calculate_avg_scale4(3, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y);
Georgios Pinitascdf51452017-08-31 14:21:36 +0100373#endif /* defined(POOL_AVG) || defined(POOL_L2) */
374
375#if defined(POOL_L2)
376 // Take square root of the result in L2 pooling
377 res = SQRT_OP(res);
378#endif /* defined(POOL_L2) */
Gian Marco Iodicecb292832017-08-02 13:19:48 +0100379
380 vstore4(res, 0, (__global DATA_TYPE *)output.ptr);
381}
Vidhya Sudhan Loganathan7485d5a2018-07-04 09:34:00 +0100382#endif // defined(POOLING3x3)
Gian Marco Iodicecb292832017-08-02 13:19:48 +0100383
Isabella Gottardia527e8c2018-01-31 17:49:25 +0000384#if defined(POOL_SIZE_X) && defined(POOL_SIZE_Y)
Gian Marco Iodicebf179552017-09-05 13:51:21 +0100385
386// Set the initial value for the pooling operation accordingly with the data type
Georgios Pinitascdf51452017-08-31 14:21:36 +0100387#if defined(POOL_AVG) || defined(POOL_L2)
Gian Marco Iodicebf179552017-09-05 13:51:21 +0100388#define INITIAL_VALUE 0
Georgios Pinitascdf51452017-08-31 14:21:36 +0100389#else /* defined(POOL_AVG) || defined(POOL_L2) */
Gian Marco Iodicebf179552017-09-05 13:51:21 +0100390#if FP16
391#define INITIAL_VALUE -HALF_MAX
392#else // FP16
393#define INITIAL_VALUE -FLT_MAX
394#endif // FP16
Gian Marco Iodicebf179552017-09-05 13:51:21 +0100395
396#endif // POOL_AVG
397
Michalis Spyroue74b2012018-04-18 09:49:16 +0100398/** Performs a pooling function of pool size equal to N (NCHW)
Gian Marco Iodicebf179552017-09-05 13:51:21 +0100399 *
Vidhya Sudhan Loganathan7485d5a2018-07-04 09:34:00 +0100400 * @note Datatype must be passed using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types are F16/F32;
Gian Marco Iodicebf179552017-09-05 13:51:21 +0100401 * @note -DFP16 must be passed at compile time if half float data type is used
Isabella Gottardia527e8c2018-01-31 17:49:25 +0000402 * @note Pool sizes must be passed using -DPOOL_SIZE_X and -DPOOL_SIZE_Y e.g. -DPOOL_SIZE_X=13;
Gian Marco Iodicebf179552017-09-05 13:51:21 +0100403 * @note In case of average pooling the following information must be passed at compile time:
404 * -DPOOL_AVG must be provided otherwise max pooling will be performed.
405 * -DMAX_WIDTH and -DMAX_HEIGHT which are the maximum accessible indeces in x and y dimensions (width + pad)
406 * -DSTRIDE_X and -DSTRIDE_Y which are the steps of the window along the x and y directions
407 * -DPAD_X and -DPAD_Y which are the pooling paddings in x and y dimension
408 *
Vidhya Sudhan Loganathan7485d5a2018-07-04 09:34:00 +0100409 * @param[in] input_ptr Pointer to the source image. Supported data types: F16/F32
Gian Marco Iodicebf179552017-09-05 13:51:21 +0100410 * @param[in] input_stride_x Stride of the source image in X dimension (in bytes)
411 * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
412 * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes)
413 * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
414 * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
415 * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
416 * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image
417 * @param[out] output_ptr Pointer to the destination image. Supported data types: same as @p input_ptr
418 * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes)
419 * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
420 * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes)
421 * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
422 * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes)
423 * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
424 * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image
425 */
Michalis Spyroue74b2012018-04-18 09:49:16 +0100426__kernel void pooling_layer_MxN_nchw(
Gian Marco Iodicebf179552017-09-05 13:51:21 +0100427 TENSOR3D_DECLARATION(input),
428 TENSOR3D_DECLARATION(output))
429{
430 // Get pixels pointer
431 Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
432 Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
433
434 VEC_DATA_TYPE(DATA_TYPE, 8)
435 vdata = INITIAL_VALUE;
436 DATA_TYPE sdata = INITIAL_VALUE;
437
438 // Load data
Isabella Gottardia527e8c2018-01-31 17:49:25 +0000439 for(int y = 0; y < POOL_SIZE_Y; y++)
Gian Marco Iodicebf179552017-09-05 13:51:21 +0100440 {
441 int x = 0;
Isabella Gottardia527e8c2018-01-31 17:49:25 +0000442 for(; x <= ((int)POOL_SIZE_X - 8); x += 8)
Gian Marco Iodicebf179552017-09-05 13:51:21 +0100443 {
444 VEC_DATA_TYPE(DATA_TYPE, 8)
445 data0 = vload8(0, (__global DATA_TYPE *)tensor3D_offset(&input, x, y, 0));
Georgios Pinitascdf51452017-08-31 14:21:36 +0100446#if defined(POOL_L2)
447 // Raise to power of 2 for L2 Pooling
448 data0 *= data0;
449#endif /* defined(POOL_L2) */
Gian Marco Iodicebf179552017-09-05 13:51:21 +0100450 vdata = POOL_OP(vdata, data0);
451 }
452
453 // Leftover
Isabella Gottardia527e8c2018-01-31 17:49:25 +0000454 for(; x < (int)POOL_SIZE_X; ++x)
Gian Marco Iodicebf179552017-09-05 13:51:21 +0100455 {
456 DATA_TYPE data0 = *((__global DATA_TYPE *)tensor3D_offset(&input, x, y, 0));
Georgios Pinitascdf51452017-08-31 14:21:36 +0100457#if defined(POOL_L2)
458 // Raise to power of 2 for L2 Pooling
459 data0 *= data0;
460#endif /* defined(POOL_L2) */
461 sdata = POOL_OP(sdata, data0);
Gian Marco Iodicebf179552017-09-05 13:51:21 +0100462 }
463 }
464
465 // Reduce result
466 VEC_DATA_TYPE(DATA_TYPE, 4)
467 reduce4 = POOL_OP(vdata.s0123, vdata.s4567);
468 VEC_DATA_TYPE(DATA_TYPE, 2)
469 reduce2 = POOL_OP(reduce4.s01, reduce4.s23);
470 DATA_TYPE res = POOL_OP(reduce2.s0, reduce2.s1);
471 res = POOL_OP(res, sdata);
472
Georgios Pinitascdf51452017-08-31 14:21:36 +0100473#if defined(POOL_AVG) || defined(POOL_L2)
Gian Marco Iodicebf179552017-09-05 13:51:21 +0100474 // Divide by pool region in case of average pooling
Isabella Gottardia527e8c2018-01-31 17:49:25 +0000475 res = DIV_OP(res, calculate_avg_scale(POOL_SIZE_X, POOL_SIZE_Y, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y));
Georgios Pinitascdf51452017-08-31 14:21:36 +0100476#endif /* defined(POOL_AVG) || defined(POOL_L2) */
477
478#if defined(POOL_L2)
479 // Take square root of the result in L2 pooling
480 res = SQRT_OP(res);
481#endif /* defined(POOL_L2) */
Gian Marco Iodicebf179552017-09-05 13:51:21 +0100482
483 // Store result
484 *(__global DATA_TYPE *)output.ptr = res;
485}
Isabella Gottardia527e8c2018-01-31 17:49:25 +0000486#endif // defined(POOL_SIZE_X) && defined(POOL_SIZE_Y)
Michalis Spyroue74b2012018-04-18 09:49:16 +0100487
488DATA_TYPE calculate_avg_scale_nhwc(const int pool_size_x, const int pool_size_y, int upper_bound_w, int upper_bound_h,
489 const int pad_x, const int pad_y, const int stride_x, const int stride_y)
490{
491 int start_x = get_global_id(1) * stride_x - pad_x;
492 int start_y = get_global_id(2) * stride_y - pad_y;
493
494#if !defined(EXCLUDE_PADDING)
495 upper_bound_w += pad_x;
496 upper_bound_h += pad_y;
497#endif /* defined(EXCLUDE_PADDING) */
498 const int end_x = min(start_x + pool_size_x, upper_bound_w);
499 const int end_y = min(start_y + pool_size_y, upper_bound_h);
500#if defined(EXCLUDE_PADDING)
501 start_x = max(0, start_x);
502 start_y = max(0, start_y);
503#endif /* defined(EXCLUDE_PADDING) */
504 return ((end_y - start_y) * (end_x - start_x));
505}
506
507/** Performs a pooling function of pool size equal to N (NHWC)
508 *
509 * @note Datatype must be passed using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types are F16/F32
510 * @note -DFP16 must be passed at compile time if half float data type is used
511 * @note Pool sizes must be passed using -DPOOL_SIZE_X and -DPOOL_SIZE_Y e.g. -DPOOL_SIZE_X=13;
512 * @note Tensors width and height must be passed at compile time using -DMAX_WIDTH and -DMAX_HEIGHT
513 * @note Strides must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y which are the steps of the window along the x and y directions
514 * @note Pad values must be passed at compile time using -DPAD_X and -DPAD_Y which are the pooling paddings in x and y dimension
515 * @note In case of average pooling the following information must be passed at compile time:
516 * -DPOOL_AVG must be provided otherwise max pooling will be performed.
517 *
518 * @param[in] input_ptr Pointer to the source image. Supported data types: F16/F32
519 * @param[in] input_stride_x Stride of the source image in X dimension (in bytes)
520 * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
521 * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes)
522 * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
523 * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
524 * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
525 * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image
526 * @param[out] output_ptr Pointer to the destination image. Supported data types: same as @p input_ptr
527 * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes)
528 * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
529 * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes)
530 * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
531 * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes)
532 * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
533 * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image
534 */
535__kernel void pooling_layer_MxN_nhwc(
536 TENSOR3D_DECLARATION(input),
537 TENSOR3D_DECLARATION(output))
538{
539 // Get pixels pointer
540 Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
541 Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
542
543 VEC_DATA_TYPE(DATA_TYPE, 8)
544 vdata = INITIAL_VALUE;
545 DATA_TYPE sdata = INITIAL_VALUE;
546
547 const int idx_width = get_global_id(1) * STRIDE_X;
548 const int idx_height = get_global_id(2) * STRIDE_Y;
549
550 for(int y = 0; y < POOL_SIZE_Y; ++y)
551 {
552 int y1 = select(y, PAD_Y - idx_height, y + idx_height < PAD_Y || y + idx_height > MAX_HEIGHT);
553 for(int x = 0; x < POOL_SIZE_X; ++x)
554 {
555 int x1 = select(x, PAD_X - idx_width - 1, x + idx_width < PAD_X || x + idx_width > MAX_WIDTH);
556 x1 = select(x1, PAD_X - idx_width - 1, y != y1);
557
558 VEC_DATA_TYPE(DATA_TYPE, 8)
559 data0 = vload8(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, x1 - PAD_X, y1 - PAD_Y));
560#if defined(POOL_L2)
561 // Raise to power of 2 for L2 Pooling
562 data0 *= data0;
563#endif /* defined(POOL_L2) */
564 vdata = POOL_OP(vdata, data0);
565 }
566 }
567
568#if defined(POOL_AVG) || defined(POOL_L2)
569 // Divide by pool region in case of average pooling
570 vdata = DIV_OP_NHWC(vdata, calculate_avg_scale_nhwc(POOL_SIZE_X, POOL_SIZE_Y, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y));
571#endif /* defined(POOL_AVG) || defined(POOL_L2) */
572
573#if defined(POOL_L2)
574 // Take square root of the result in L2 pooling
575 vdata = SQRT_OP(vdata);
576#endif /* defined(POOL_L2) */
577
578 // Store result
579 vstore8(vdata, 0, (__global DATA_TYPE *)output.ptr);
580}