blob: fae15b2347ec2728cb174e3516e21ca256d794d4 [file] [log] [blame]
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001/*
Giorgio Arenae36208c2021-01-21 14:53:56 +00002 * Copyright (c) 2016-2021 Arm Limited.
Anthony Barbier6ff3b192017-09-04 18:44:23 +01003 *
4 * SPDX-License-Identifier: MIT
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to
8 * deal in the Software without restriction, including without limitation the
9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10 * sell copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in all
14 * copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24#ifndef ARM_COMPUTE_HELPER_H
25#define ARM_COMPUTE_HELPER_H
26
Giorgio Arenad304adb2020-10-02 10:20:11 +010027#include "load_store_utility.h"
28
Georgios Pinitasdaa38552018-08-28 17:43:18 +010029#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
Anthony Barbier6ff3b192017-09-04 18:44:23 +010030#pragma OPENCL EXTENSION cl_khr_fp16 : enable
Georgios Pinitasdaa38552018-08-28 17:43:18 +010031#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
Matthew Bentham6f31f8c2017-10-27 11:50:06 +010032
Georgios Pinitasdaa38552018-08-28 17:43:18 +010033#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
Michalis Spyroue03342e2018-01-15 14:39:13 +000034#pragma OPENCL EXTENSION cl_arm_integer_dot_product_int8 : enable
Georgios Pinitasdaa38552018-08-28 17:43:18 +010035#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
Michalis Spyroue03342e2018-01-15 14:39:13 +000036
Georgios Pinitasdaa38552018-08-28 17:43:18 +010037#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
Giorgio Arenaeff8d952018-07-02 15:29:57 +010038#pragma OPENCL EXTENSION cl_arm_integer_dot_product_accumulate_int8 : enable
Georgios Pinitasdaa38552018-08-28 17:43:18 +010039#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
Giorgio Arenaeff8d952018-07-02 15:29:57 +010040
Georgios Pinitasdaa38552018-08-28 17:43:18 +010041#if defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
steniu01f01f9de2017-09-27 17:00:11 +010042#pragma OPENCL EXTENSION cl_arm_printf : enable
Georgios Pinitas238c97c2018-08-31 17:28:29 +010043#endif // defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
Anthony Barbier6ff3b192017-09-04 18:44:23 +010044
Usama Arife2428a02019-05-09 11:03:17 +010045#define GPU_ARCH_MIDGARD 0x100
46#define GPU_ARCH_BIFROST 0x200
47
Sang-Hoon Parkbfd75d62019-10-30 14:56:17 +000048/** Concatenate two inputs.
49 *
50 * @param[in] a The first input to be concatenated
51 * @param[in] b The second input to be concatenated
52 *
53 * @return The concatenated output
54 */
Gian Marco Iodice43a129e2019-05-14 10:14:08 +010055#define CONCAT(a, b) a##b
56
Sang-Hoon Parkbfd75d62019-10-30 14:56:17 +000057/** Expand the given vector
58 *
59 * @param[in] x The vector to be expanded
60 *
61 * @return The expanded output
62 */
Georgios Pinitase5f8fd62017-06-23 18:03:44 +010063#define EXPAND(x) x
64
Sang-Hoon Parkbfd75d62019-10-30 14:56:17 +000065/** Clamp the given value between an upper and lower bound.
66 *
67 * @param[in] x The value to be clamped
68 * @param[in] min_val The lower bound
69 * @param[in] max_val The upper bound
70 *
71 * @return The clamped value.
72 */
Anthony Barbier6ff3b192017-09-04 18:44:23 +010073#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)
74
Sang-Hoon Parkbfd75d62019-10-30 14:56:17 +000075/** REVn reverses the given vector whose size is n.
76 * @name REVn
77 *
78 * @param[in] x The vector to be reversed
79 *
80 * @return The reversed vector
81 * @{
82 */
Giorgio Arena5c4a8e92019-08-28 17:55:07 +010083#define REV1(x) ((x))
84#define REV2(x) ((x).s10)
85#define REV3(x) ((x).s210)
86#define REV4(x) ((x).s3210)
87#define REV8(x) ((x).s76543210)
88#define REV16(x) ((x).sFEDCBA9876543210)
Sang-Hoon Parkbfd75d62019-10-30 14:56:17 +000089/** @} */ // end of group REVn
Giorgio Arena5c4a8e92019-08-28 17:55:07 +010090
Sang-Hoon Parkbfd75d62019-10-30 14:56:17 +000091/** Reverse the given vector.
92 * @name REVERSE
93 *
94 * @param[in] x The vector to be reversed
95 * @param[in] s The size of the vector
96 *
97 * @return The reversed vector
98 * @{
99 */
Giorgio Arena5c4a8e92019-08-28 17:55:07 +0100100#define REVERSE_STR(x, s) REV##s((x))
101#define REVERSE(x, s) REVERSE_STR(x, s)
Sang-Hoon Parkbfd75d62019-10-30 14:56:17 +0000102/** @} */ // end of group REVERSE
Giorgio Arena5c4a8e92019-08-28 17:55:07 +0100103
Sang-Hoon Parkbfd75d62019-10-30 14:56:17 +0000104/** Circular-right-shift (rotate-right) the vector of size s by the amount of n.
105 * @name ROTs_n
106 *
107 * @param[in] x The vector to be shifted
108 *
109 * @return The shifted vector
110 * @{
111 */
Giorgio Arena5c4a8e92019-08-28 17:55:07 +0100112#define ROT1_0(x) ((x))
Giorgio Arena93240222020-12-23 11:55:29 +0000113#define ROT1_1(x) ((x))
Giorgio Arena5c4a8e92019-08-28 17:55:07 +0100114
115#define ROT2_0(x) ((x))
116#define ROT2_1(x) ((x).s10)
Giorgio Arena93240222020-12-23 11:55:29 +0000117#define ROT2_2(x) ((x))
Giorgio Arena5c4a8e92019-08-28 17:55:07 +0100118
119#define ROT3_0(x) ((x))
120#define ROT3_1(x) ((x).s201)
121#define ROT3_2(x) ((x).s120)
Giorgio Arena93240222020-12-23 11:55:29 +0000122#define ROT3_3(x) ((x))
Giorgio Arena5c4a8e92019-08-28 17:55:07 +0100123
124#define ROT4_0(x) ((x))
125#define ROT4_1(x) ((x).s3012)
126#define ROT4_2(x) ((x).s2301)
127#define ROT4_3(x) ((x).s1230)
Giorgio Arena93240222020-12-23 11:55:29 +0000128#define ROT4_4(x) ((x))
Giorgio Arena5c4a8e92019-08-28 17:55:07 +0100129
130#define ROT8_0(x) ((x))
131#define ROT8_1(x) ((x).s70123456)
132#define ROT8_2(x) ((x).s67012345)
133#define ROT8_3(x) ((x).s56701234)
134#define ROT8_4(x) ((x).s45670123)
135#define ROT8_5(x) ((x).s34567012)
136#define ROT8_6(x) ((x).s23456701)
137#define ROT8_7(x) ((x).s12345670)
Giorgio Arena93240222020-12-23 11:55:29 +0000138#define ROT8_8(x) ((x))
Giorgio Arena5c4a8e92019-08-28 17:55:07 +0100139
140#define ROT16_0(x) ((x))
141#define ROT16_1(x) ((x).sF0123456789ABCDE)
142#define ROT16_2(x) ((x).sEF0123456789ABCD)
143#define ROT16_3(x) ((x).sDEF0123456789ABC)
144#define ROT16_4(x) ((x).sCDEF0123456789AB)
145#define ROT16_5(x) ((x).sBCDEF0123456789A)
146#define ROT16_6(x) ((x).sABCDEF0123456789)
147#define ROT16_7(x) ((x).s9ABCDEF012345678)
148#define ROT16_8(x) ((x).s89ABCDEF01234567)
149#define ROT16_9(x) ((x).s789ABCDEF0123456)
150#define ROT16_10(x) ((x).s6789ABCDEF012345)
151#define ROT16_11(x) ((x).s56789ABCDEF01234)
152#define ROT16_12(x) ((x).s456789ABCDEF0123)
153#define ROT16_13(x) ((x).s3456789ABCDEF012)
154#define ROT16_14(x) ((x).s23456789ABCDEF01)
155#define ROT16_15(x) ((x).s123456789ABCDEF0)
Giorgio Arena93240222020-12-23 11:55:29 +0000156#define ROT16_16(x) ((x))
Sang-Hoon Parkbfd75d62019-10-30 14:56:17 +0000157/** @} */ // end of group ROTs_n
Giorgio Arena5c4a8e92019-08-28 17:55:07 +0100158
Sang-Hoon Parkbfd75d62019-10-30 14:56:17 +0000159/** Circular-right-shift (rotate-right) the given vector by the given amount.
160 * @name ROTATE
161 *
162 * @param[in] x The vector to be shifted
163 * @param[in] s The size of the vector
164 * @param[in] n The amount to be shifted
165 *
166 * @return The shifted vector
167 * @{
168 */
Giorgio Arena5c4a8e92019-08-28 17:55:07 +0100169#define ROTATE_STR(x, s, n) ROT##s##_##n(x)
170#define ROTATE(x, s, n) ROTATE_STR(x, s, n)
Sang-Hoon Parkbfd75d62019-10-30 14:56:17 +0000171/** @} */ // end of group ROTATE
Giorgio Arena5c4a8e92019-08-28 17:55:07 +0100172
Sang-Hoon Parkbfd75d62019-10-30 14:56:17 +0000173/** Creates a vector of size n filled with offset values corresponding to the location of each element.
174 * @name V_OFFSn
175 *
176 * @param[in] dt The data type of the output vector
177 *
178 * @return The vector filled with offset values
179 * @{
180 */
Giorgio Arena2d1a8352020-10-26 15:04:08 +0000181#define V_OFFS1(dt) (dt##1)(0)
182#define V_OFFS2(dt) (dt##2)(0, 1)
183#define V_OFFS3(dt) (dt##3)(0, 1, 2)
184#define V_OFFS4(dt) (dt##4)(0, 1, 2, 3)
185#define V_OFFS8(dt) (dt##8)(0, 1, 2, 3, 4, 5, 6, 7)
186#define V_OFFS16(dt) (dt##16)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
Sang-Hoon Parkbfd75d62019-10-30 14:56:17 +0000187/** @} */ // end of group V_OFFSn
Giorgio Arena5c4a8e92019-08-28 17:55:07 +0100188
Sang-Hoon Parkbfd75d62019-10-30 14:56:17 +0000189/** Create a vector filled with offset values corresponding to the location of each element.
190 * @name VEC_OFFS
191 *
192 * @param[in] dt The data type of the output vector
193 * @param[in] s The size of the output vector
194 *
195 * @return The vector filled with offset values
196 * @{
197 */
Giorgio Arena5c4a8e92019-08-28 17:55:07 +0100198#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt)
199#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s)
Sang-Hoon Parkbfd75d62019-10-30 14:56:17 +0000200/** @} */ // end of group VEC_OFFS
Giorgio Arena5c4a8e92019-08-28 17:55:07 +0100201
Georgios Pinitasac4e8732017-07-05 17:02:25 +0100202#define VLOAD_STR(size) vload##size
203#define VLOAD(size) VLOAD_STR(size)
204
Giorgio Arenabde2f352021-09-07 14:15:28 +0100205/** Extended partial vload that correctly handles scalar values as well.
206 * Load the **lower** 0 to (n-1)th elements of the given vector while minimising the amount of load ops
207 * @name VLOAD_PARTIAL
208 *
209 * @note With this macro, the passed data can be both a vector and a scalar
210 * @note @p load_size needs to be <= @p size
211 * eg 1: Valid
212 * VLOAD_PARTIAL(16, 15) ...;
213 * eg 2: Invalid
214 * VLOAD_PARTIAL(4, 7) ...;
215 *
216 * @param[in] size The width of @p DATA. Supported values: 1(scalar), 2, 3, 4, 8, 16
217 * @param[in] load_size The number of lower elements to load. Supported values: 1-16, but has to be <= @p size
218 * @{
219 */
220#define VLOAD_PARTIAL_STR(size, load_size) vload_partial_##size##_##load_size
221#define VLOAD_PARTIAL(size, load_size) VLOAD_PARTIAL_STR(size, load_size)
222
223#define NO_LOAD(data, offs, ptr) \
224 { \
225 }
226
227// Size == 1 (scalar)
228#define vload_partial_1_0 NO_LOAD
229#define vload_partial_1_1 vload1
230#define vload_partial_1_2 NO_LOAD
231#define vload_partial_1_3 NO_LOAD
232#define vload_partial_1_4 NO_LOAD
233#define vload_partial_1_5 NO_LOAD
234#define vload_partial_1_6 NO_LOAD
235#define vload_partial_1_7 NO_LOAD
236#define vload_partial_1_8 NO_LOAD
237#define vload_partial_1_9 NO_LOAD
238#define vload_partial_1_10 NO_LOAD
239#define vload_partial_1_11 NO_LOAD
240#define vload_partial_1_12 NO_LOAD
241#define vload_partial_1_13 NO_LOAD
242#define vload_partial_1_14 NO_LOAD
243#define vload_partial_1_15 NO_LOAD
244#define vload_partial_1_16 NO_LOAD
245// Size == 2
246#define vload_partial_2_0 NO_LOAD
247#define vload_partial_2_1 vload_partial_1
248#define vload_partial_2_2 vload_partial_2
249#define vload_partial_2_3 NO_LOAD
250#define vload_partial_2_4 NO_LOAD
251#define vload_partial_2_5 NO_LOAD
252#define vload_partial_2_6 NO_LOAD
253#define vload_partial_2_7 NO_LOAD
254#define vload_partial_2_8 NO_LOAD
255#define vload_partial_2_9 NO_LOAD
256#define vload_partial_2_10 NO_LOAD
257#define vload_partial_2_11 NO_LOAD
258#define vload_partial_2_12 NO_LOAD
259#define vload_partial_2_13 NO_LOAD
260#define vload_partial_2_14 NO_LOAD
261#define vload_partial_2_15 NO_LOAD
262#define vload_partial_2_16 NO_LOAD
263// Size == 3
264#define vload_partial_3_0 NO_LOAD
265#define vload_partial_3_1 vload_partial_1
266#define vload_partial_3_2 vload_partial_2
267#define vload_partial_3_3 vload_partial_3
268#define vload_partial_3_4 NO_LOAD
269#define vload_partial_3_5 NO_LOAD
270#define vload_partial_3_6 NO_LOAD
271#define vload_partial_3_7 NO_LOAD
272#define vload_partial_3_8 NO_LOAD
273#define vload_partial_3_9 NO_LOAD
274#define vload_partial_3_10 NO_LOAD
275#define vload_partial_3_11 NO_LOAD
276#define vload_partial_3_12 NO_LOAD
277#define vload_partial_3_13 NO_LOAD
278#define vload_partial_3_14 NO_LOAD
279#define vload_partial_3_15 NO_LOAD
280#define vload_partial_3_16 NO_LOAD
281// Size == 4
282#define vload_partial_4_0 NO_LOAD
283#define vload_partial_4_1 vload_partial_1
284#define vload_partial_4_2 vload_partial_2
285#define vload_partial_4_3 vload_partial_3
286#define vload_partial_4_4 vload_partial_4
287#define vload_partial_4_5 NO_LOAD
288#define vload_partial_4_6 NO_LOAD
289#define vload_partial_4_7 NO_LOAD
290#define vload_partial_4_8 NO_LOAD
291#define vload_partial_4_9 NO_LOAD
292#define vload_partial_4_10 NO_LOAD
293#define vload_partial_4_11 NO_LOAD
294#define vload_partial_4_12 NO_LOAD
295#define vload_partial_4_13 NO_LOAD
296#define vload_partial_4_14 NO_LOAD
297#define vload_partial_4_15 NO_LOAD
298#define vload_partial_4_16 NO_LOAD
299// Size == 8
300#define vload_partial_8_0 NO_LOAD
301#define vload_partial_8_1 vload_partial_1
302#define vload_partial_8_2 vload_partial_2
303#define vload_partial_8_3 vload_partial_3
304#define vload_partial_8_4 vload_partial_4
305#define vload_partial_8_5 vload_partial_5
306#define vload_partial_8_6 vload_partial_6
307#define vload_partial_8_7 vload_partial_7
308#define vload_partial_8_8 vload_partial_8
309#define vload_partial_8_9 NO_LOAD
310#define vload_partial_8_10 NO_LOAD
311#define vload_partial_8_11 NO_LOAD
312#define vload_partial_8_12 NO_LOAD
313#define vload_partial_8_13 NO_LOAD
314#define vload_partial_8_14 NO_LOAD
315#define vload_partial_8_15 NO_LOAD
316#define vload_partial_8_16 NO_LOAD
317// Size == 16
318#define vload_partial_16_0 NO_LOAD
319#define vload_partial_16_1 vload_partial_1
320#define vload_partial_16_2 vload_partial_2
321#define vload_partial_16_3 vload_partial_3
322#define vload_partial_16_4 vload_partial_4
323#define vload_partial_16_5 vload_partial_5
324#define vload_partial_16_6 vload_partial_6
325#define vload_partial_16_7 vload_partial_7
326#define vload_partial_16_8 vload_partial_8
327#define vload_partial_16_9 vload_partial_9
328#define vload_partial_16_10 vload_partial_10
329#define vload_partial_16_11 vload_partial_11
330#define vload_partial_16_12 vload_partial_12
331#define vload_partial_16_13 vload_partial_13
332#define vload_partial_16_14 vload_partial_14
333#define vload_partial_16_15 vload_partial_15
334#define vload_partial_16_16 vload_partial_16
335
336/** Partial vload. Load the **lower** 0 to (n-1)th elements of the given vector while minimising the amount of vload ops
337 * @name vload_partial_n
338 *
339 * @note @p DATA needs to be a vector not a scalar
340 * @note n needs to be <= the vector width of the input variable @p DATA
341 * eg 1: Valid
342 * vload_partial_15(var:float16, 0, 0xabcd);
343 * eg 2: Invalid
344 * vload_partial_7(var:float4, 0, 0xabcd);
345 *
346 * @note in cases n == 1, 2, 3, 4, 8, 16, no extra vload is invoked, thus there's no performance penalty.
347 *
348 * @param[in] DATA The name of the variable where to load the values
349 * @param[in] OFFSET Offset in n
350 * @param[in] PTR The base pointer
351 * @{
352 */
353#define vload_partial_1(DATA, OFFSET, PTR) \
354 DATA.s0 = vload1(OFFSET, PTR);
355
356#define vload_partial_2(DATA, OFFSET, PTR) \
357 DATA.s01 = vload2(OFFSET, PTR);
358
359#define vload_partial_3(DATA, OFFSET, PTR) \
360 DATA.s012 = vload3(OFFSET, PTR);
361
362#define vload_partial_4(DATA, OFFSET, PTR) \
363 DATA.s0123 = vload4(OFFSET, PTR);
364
365#define vload_partial_5(DATA, OFFSET, PTR) \
366 DATA.s0123 = vload_partial_4(DATA, OFFSET, PTR); \
367 DATA.s4 = vload1(OFFSET, PTR + 4);
368
369#define vload_partial_6(DATA, OFFSET, PTR) \
370 DATA.s0123 = vload_partial_4(DATA, OFFSET, PTR); \
371 DATA.s45 = vload_partial_2(DATA, OFFSET, PTR + 4);
372
373#define vload_partial_7(DATA, OFFSET, PTR) \
374 DATA.s0123 = vload_partial_4(DATA, OFFSET, PTR); \
375 DATA.s456 = vload_partial_3(DATA, OFFSET, PTR + 4);
376
377#define vload_partial_8(DATA, OFFSET, PTR) \
378 DATA.s01234567 = vload8(OFFSET, PTR);
379
380#define vload_partial_9(DATA, OFFSET, PTR) \
381 DATA.s01234567 = vload_partial_8(DATA, OFFSET, PTR); \
382 DATA.s8 = vload1(OFFSET, PTR + 8);
383
384#define vload_partial_10(DATA, OFFSET, PTR) \
385 DATA.s01234567 = vload_partial_8(DATA, OFFSET, PTR); \
386 DATA.s89 = vload_partial_2(DATA, OFFSET, PTR + 8);
387
388#define vload_partial_11(DATA, OFFSET, PTR) \
389 DATA.s01234567 = vload_partial_8(DATA, OFFSET, PTR); \
390 DATA.s89A = vload_partial_3(DATA, OFFSET, PTR + 8);
391
392#define vload_partial_12(DATA, OFFSET, PTR) \
393 DATA.s01234567 = vload_partial_8(DATA, OFFSET, PTR); \
394 DATA.s89AB = vload_partial_4(DATA, OFFSET, PTR + 8);
395
396#define vload_partial_13(DATA, OFFSET, PTR) \
397 DATA.s01234567 = vload_partial_8(DATA, OFFSET, PTR); \
398 DATA.s89ABC = vload_partial_5(DATA, OFFSET, PTR + 8);
399
400#define vload_partial_14(DATA, OFFSET, PTR) \
401 DATA.s01234567 = vload_partial_8(DATA, OFFSET, PTR); \
402 DATA.s89ABCD = vload_partial_6(DATA, OFFSET, PTR + 8);
403
404#define vload_partial_15(DATA, OFFSET, PTR) \
405 DATA.s01234567 = vload_partial_8(DATA, OFFSET, PTR); \
406 DATA.s89ABCDE = vload_partial_7(DATA, OFFSET, PTR + 8);
407
408#define vload_partial_16(DATA, OFFSET, PTR) \
409 DATA = vload16(OFFSET, PTR);
410/** @} */ // end of groupd vload_partial_n
411/** @} */ // end of groupd VLOAD_PARTIAL
412
Gian Marco Iodicee3a849a2020-06-10 17:59:30 +0100413#define PIXEL_UNIT4 1
414#define PIXEL_UNIT8 2
415#define PIXEL_UNIT16 4
416
417/** Utility macro to convert a vector size in pixel unit.
418 *
419 * @name CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT
420 *
421 * @param[in] vec_size Vector size. Only 4,8 and 16 is supported
422 *
423 * @return The pixel unit (number of pixels)
424 * @{
425 */
426#define CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT_STR(vec_size) PIXEL_UNIT##vec_size
427#define CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(vec_size) CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT_STR(vec_size)
428/** @} */ // end of group CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT
429
430#define read_image2d_floatx1(img, x_coord, y_coord) (float4)(read_imagef(img, (int2)(x_coord, y_coord)));
431#define read_image2d_floatx2(img, x_coord, y_coord) (float8)(read_imagef(img, (int2)(x_coord, y_coord)), read_imagef(img, (int2)(x_coord + 1, y_coord)));
432#define read_image2d_floatx4(img, x_coord, y_coord) (float16)(read_imagef(img, (int2)(x_coord, y_coord)), read_imagef(img, (int2)(x_coord + 1, y_coord)), read_imagef(img, (int2)(x_coord + 2, y_coord)), read_imagef(img, (int2)(x_coord + 3, y_coord)));
433
434#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
435#define read_image2d_halfx1(img, x_coord, y_coord) (half4)(read_imageh(img, (int2)(x_coord, y_coord)));
436#define read_image2d_halfx2(img, x_coord, y_coord) (half8)(read_imageh(img, (int2)(x_coord, y_coord)), read_imageh(img, (int2)(x_coord + 1, y_coord)));
437#define read_image2d_halfx4(img, x_coord, y_coord) (half16)(read_imageh(img, (int2)(x_coord, y_coord)), read_imageh(img, (int2)(x_coord + 1, y_coord)), read_imageh(img, (int2)(x_coord + 2, y_coord)), read_imageh(img, (int2)(x_coord + 3, y_coord)));
438#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
439
440/** Utility macro to read a 2D OpenCL image object.
441 *
442 * @note Coordinates are not normalized
443 *
444 * @param[in] data_type Data type
445 * @param[in] n0 Number of pixel to read. Only 1,2 and 4 is supported
446 * @param[in] img OpenCL image object
447 * @param[in] x_coord The x coordinate for the top-left pixel
448 * @param[in] y_coord The y coordinate for the top-left pixel
449 *
450 * @return Pixels from the 2D OpenCL image object
451 * @{
452 */
453#define READ_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord) read_image2d_##data_type##x##n0(img, x_coord, y_coord)
454#define READ_IMAGE2D(data_type, n0, img, x_coord, y_coord) READ_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord)
455
Georgios Pinitasac4e8732017-07-05 17:02:25 +0100456#define VSTORE_STR(size) vstore##size
457#define VSTORE(size) VSTORE_STR(size)
458
Manuel Bottini0d0028c2018-10-02 16:41:52 +0100459#define float1 float
460#define half1 half
Usama Arif0681e3b2019-04-25 14:28:07 +0100461#define char1 char
462#define uchar1 uchar
463#define short1 short
464#define ushort1 ushort
465#define int1 int
466#define uint1 uint
467#define long1 long
468#define ulong1 ulong
469#define double1 double
470
471#define vload1(OFFSET, PTR) *(OFFSET + PTR)
Gian Marco Iodice9285adb2019-09-05 16:10:27 +0100472#define vstore1(DATA, OFFSET, PTR) *(OFFSET + PTR) = DATA
Manuel Bottini0d0028c2018-10-02 16:41:52 +0100473
SiCong Li3a501662020-06-26 10:02:06 +0100474/** Extended partial vstore that correctly handles scalar values as well.
475 * Store the **lower** 0 to (n-1)th elements of the given vector while minimising the amount of vstore ops
476 * @name VSTORE_PARTIAL
477 *
478 * @note With this macro, the passed data can be both a vector and a scalar
479 * @note @p store_size needs to be <= @p size
480 * eg 1: Valid
481 * VSTORE_PARTIAL(16, 15) ...;
482 * eg 2: Invalid
483 * VSTORE_PARTIAL(4, 7) ...;
484 *
485 * @param[in] size The width of @p DATA. Supported values: 1(scalar), 2, 3, 4, 8, 16
486 * @param[in] store_size The number of lower elements to store. Supported values: 1-16, but has to be <= @p size
487 * @{
488 */
489#define VSTORE_PARTIAL_STR(size, store_size) vstore_partial_##size##_##store_size
490#define VSTORE_PARTIAL(size, store_size) VSTORE_PARTIAL_STR(size, store_size)
491
Giorgio Arenad304adb2020-10-02 10:20:11 +0100492#define NO_STORE(data, offs, ptr) \
493 { \
494 }
495
SiCong Li3a501662020-06-26 10:02:06 +0100496// Size == 1 (scalar)
Giorgio Arenad304adb2020-10-02 10:20:11 +0100497#define vstore_partial_1_0 NO_STORE
SiCong Li3a501662020-06-26 10:02:06 +0100498#define vstore_partial_1_1 vstore1
SiCong Li0ea50e32020-11-05 09:18:11 +0000499#define vstore_partial_1_2 NO_STORE
500#define vstore_partial_1_3 NO_STORE
501#define vstore_partial_1_4 NO_STORE
502#define vstore_partial_1_5 NO_STORE
503#define vstore_partial_1_6 NO_STORE
504#define vstore_partial_1_7 NO_STORE
505#define vstore_partial_1_8 NO_STORE
506#define vstore_partial_1_9 NO_STORE
507#define vstore_partial_1_10 NO_STORE
508#define vstore_partial_1_11 NO_STORE
509#define vstore_partial_1_12 NO_STORE
510#define vstore_partial_1_13 NO_STORE
511#define vstore_partial_1_14 NO_STORE
512#define vstore_partial_1_15 NO_STORE
513#define vstore_partial_1_16 NO_STORE
SiCong Li3a501662020-06-26 10:02:06 +0100514// Size == 2
Giorgio Arenad304adb2020-10-02 10:20:11 +0100515#define vstore_partial_2_0 NO_STORE
SiCong Li3a501662020-06-26 10:02:06 +0100516#define vstore_partial_2_1 vstore_partial_1
517#define vstore_partial_2_2 vstore_partial_2
SiCong Li0ea50e32020-11-05 09:18:11 +0000518#define vstore_partial_2_3 NO_STORE
519#define vstore_partial_2_4 NO_STORE
520#define vstore_partial_2_5 NO_STORE
521#define vstore_partial_2_6 NO_STORE
522#define vstore_partial_2_7 NO_STORE
523#define vstore_partial_2_8 NO_STORE
524#define vstore_partial_2_9 NO_STORE
525#define vstore_partial_2_10 NO_STORE
526#define vstore_partial_2_11 NO_STORE
527#define vstore_partial_2_12 NO_STORE
528#define vstore_partial_2_13 NO_STORE
529#define vstore_partial_2_14 NO_STORE
530#define vstore_partial_2_15 NO_STORE
531#define vstore_partial_2_16 NO_STORE
SiCong Li3a501662020-06-26 10:02:06 +0100532// Size == 3
Giorgio Arenad304adb2020-10-02 10:20:11 +0100533#define vstore_partial_3_0 NO_STORE
SiCong Li3a501662020-06-26 10:02:06 +0100534#define vstore_partial_3_1 vstore_partial_1
535#define vstore_partial_3_2 vstore_partial_2
536#define vstore_partial_3_3 vstore_partial_3
SiCong Li0ea50e32020-11-05 09:18:11 +0000537#define vstore_partial_3_4 NO_STORE
538#define vstore_partial_3_5 NO_STORE
539#define vstore_partial_3_6 NO_STORE
540#define vstore_partial_3_7 NO_STORE
541#define vstore_partial_3_8 NO_STORE
542#define vstore_partial_3_9 NO_STORE
543#define vstore_partial_3_10 NO_STORE
544#define vstore_partial_3_11 NO_STORE
545#define vstore_partial_3_12 NO_STORE
546#define vstore_partial_3_13 NO_STORE
547#define vstore_partial_3_14 NO_STORE
548#define vstore_partial_3_15 NO_STORE
549#define vstore_partial_3_16 NO_STORE
SiCong Li3a501662020-06-26 10:02:06 +0100550// Size == 4
Giorgio Arenad304adb2020-10-02 10:20:11 +0100551#define vstore_partial_4_0 NO_STORE
SiCong Li3a501662020-06-26 10:02:06 +0100552#define vstore_partial_4_1 vstore_partial_1
553#define vstore_partial_4_2 vstore_partial_2
554#define vstore_partial_4_3 vstore_partial_3
555#define vstore_partial_4_4 vstore_partial_4
SiCong Li0ea50e32020-11-05 09:18:11 +0000556#define vstore_partial_4_5 NO_STORE
557#define vstore_partial_4_6 NO_STORE
558#define vstore_partial_4_7 NO_STORE
559#define vstore_partial_4_8 NO_STORE
560#define vstore_partial_4_9 NO_STORE
561#define vstore_partial_4_10 NO_STORE
562#define vstore_partial_4_11 NO_STORE
563#define vstore_partial_4_12 NO_STORE
564#define vstore_partial_4_13 NO_STORE
565#define vstore_partial_4_14 NO_STORE
566#define vstore_partial_4_15 NO_STORE
567#define vstore_partial_4_16 NO_STORE
SiCong Li3a501662020-06-26 10:02:06 +0100568// Size == 8
Giorgio Arenad304adb2020-10-02 10:20:11 +0100569#define vstore_partial_8_0 NO_STORE
SiCong Li3a501662020-06-26 10:02:06 +0100570#define vstore_partial_8_1 vstore_partial_1
571#define vstore_partial_8_2 vstore_partial_2
572#define vstore_partial_8_3 vstore_partial_3
573#define vstore_partial_8_4 vstore_partial_4
574#define vstore_partial_8_5 vstore_partial_5
575#define vstore_partial_8_6 vstore_partial_6
576#define vstore_partial_8_7 vstore_partial_7
577#define vstore_partial_8_8 vstore_partial_8
SiCong Li0ea50e32020-11-05 09:18:11 +0000578#define vstore_partial_8_9 NO_STORE
579#define vstore_partial_8_10 NO_STORE
580#define vstore_partial_8_11 NO_STORE
581#define vstore_partial_8_12 NO_STORE
582#define vstore_partial_8_13 NO_STORE
583#define vstore_partial_8_14 NO_STORE
584#define vstore_partial_8_15 NO_STORE
585#define vstore_partial_8_16 NO_STORE
SiCong Li3a501662020-06-26 10:02:06 +0100586// Size == 16
Giorgio Arenad304adb2020-10-02 10:20:11 +0100587#define vstore_partial_16_0 NO_STORE
SiCong Li3a501662020-06-26 10:02:06 +0100588#define vstore_partial_16_1 vstore_partial_1
589#define vstore_partial_16_2 vstore_partial_2
590#define vstore_partial_16_3 vstore_partial_3
591#define vstore_partial_16_4 vstore_partial_4
592#define vstore_partial_16_5 vstore_partial_5
593#define vstore_partial_16_6 vstore_partial_6
594#define vstore_partial_16_7 vstore_partial_7
595#define vstore_partial_16_8 vstore_partial_8
596#define vstore_partial_16_9 vstore_partial_9
597#define vstore_partial_16_10 vstore_partial_10
598#define vstore_partial_16_11 vstore_partial_11
599#define vstore_partial_16_12 vstore_partial_12
600#define vstore_partial_16_13 vstore_partial_13
601#define vstore_partial_16_14 vstore_partial_14
602#define vstore_partial_16_15 vstore_partial_15
603#define vstore_partial_16_16 vstore_partial_16
604
605/** Partial vstore. Store the **lower** 0 to (n-1)th elements of the given vector while minimising the amount of vstore ops
606 * @name vstore_partial_n
607 *
608 * @note @p DATA needs to be a vector not a scalar
609 * @note n needs to be <= the vector width of the input variable @p DATA
610 * eg 1: Valid
611 * vstore_partial_15(var:float16, 0, 0xabcd);
612 * eg 2: Invalid
613 * vstore_partial_7(var:float4, 0, 0xabcd);
614 *
615 * @note in cases n == 1, 2, 3, 4, 8, 16, no extra vstore is invoked, thus there's no performance penalty.
616 *
617 * @param[in] DATA The name of the variable
618 * @param[in] OFFSET Offset in n
619 * @param[in] PTR The base pointer
620 * @{
621 */
622#define vstore_partial_1(DATA, OFFSET, PTR) \
623 vstore1(DATA.s0, OFFSET, PTR);
624
625#define vstore_partial_2(DATA, OFFSET, PTR) \
626 vstore2(DATA.s01, OFFSET, PTR);
627
628#define vstore_partial_3(DATA, OFFSET, PTR) \
629 vstore3(DATA.s012, OFFSET, PTR);
630
631#define vstore_partial_4(DATA, OFFSET, PTR) \
632 vstore4(DATA.s0123, OFFSET, PTR);
633
634#define vstore_partial_5(DATA, OFFSET, PTR) \
635 vstore_partial_4(DATA.s0123, OFFSET, PTR); \
SiCong Li3b64e3e2020-07-28 09:01:28 +0100636 vstore1(DATA.s4, OFFSET, PTR + 4);
SiCong Li3a501662020-06-26 10:02:06 +0100637
638#define vstore_partial_6(DATA, OFFSET, PTR) \
639 vstore_partial_4(DATA.s0123, OFFSET, PTR); \
640 vstore_partial_2(DATA.s45, OFFSET, PTR + 4);
641
642#define vstore_partial_7(DATA, OFFSET, PTR) \
643 vstore_partial_4(DATA.s0123, OFFSET, PTR); \
644 vstore_partial_3(DATA.s456, OFFSET, PTR + 4);
645
646#define vstore_partial_8(DATA, OFFSET, PTR) \
647 vstore8(DATA.s01234567, OFFSET, PTR);
648
649#define vstore_partial_9(DATA, OFFSET, PTR) \
650 vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
SiCong Li3b64e3e2020-07-28 09:01:28 +0100651 vstore1(DATA.s8, OFFSET, PTR + 8);
SiCong Li3a501662020-06-26 10:02:06 +0100652
653#define vstore_partial_10(DATA, OFFSET, PTR) \
654 vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
655 vstore_partial_2(DATA.s89, OFFSET, PTR + 8);
656
657#define vstore_partial_11(DATA, OFFSET, PTR) \
658 vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
659 vstore_partial_3(DATA.s89a, OFFSET, PTR + 8);
660
661#define vstore_partial_12(DATA, OFFSET, PTR) \
662 vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
663 vstore_partial_4(DATA.s89ab, OFFSET, PTR + 8);
664
665#define vstore_partial_13(DATA, OFFSET, PTR) \
666 vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
Giorgio Arenad304adb2020-10-02 10:20:11 +0100667 vstore_partial_5(DATA.s89abcdef, OFFSET, PTR + 8);
SiCong Li3a501662020-06-26 10:02:06 +0100668
669#define vstore_partial_14(DATA, OFFSET, PTR) \
670 vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
Giorgio Arenad304adb2020-10-02 10:20:11 +0100671 vstore_partial_6(DATA.s89abcdef, OFFSET, PTR + 8);
SiCong Li3a501662020-06-26 10:02:06 +0100672
673#define vstore_partial_15(DATA, OFFSET, PTR) \
674 vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
Giorgio Arenad304adb2020-10-02 10:20:11 +0100675 vstore_partial_7(DATA.s89abcdef, OFFSET, PTR + 8);
SiCong Li3a501662020-06-26 10:02:06 +0100676
677#define vstore_partial_16(DATA, OFFSET, PTR) \
678 vstore16(DATA, OFFSET, PTR);
679/** @} */ // end of groupd vstore_partial_n
680/** @} */ // end of groupd VSTORE_PARTIAL
681
Gian Marco Iodice0c17aa22019-09-27 09:23:15 +0100682// Convert built-in functions with _sat modifier are not supported in floating point so we create defines
683// without _sat to overcome this issue
684#define convert_float_sat convert_float
685#define convert_float1_sat convert_float
686#define convert_float2_sat convert_float2
687#define convert_float3_sat convert_float3
688#define convert_float4_sat convert_float4
689#define convert_float8_sat convert_float8
690#define convert_float16_sat convert_float16
691#define convert_half_sat convert_float
692#define convert_half1_sat convert_half
693#define convert_half2_sat convert_half2
694#define convert_half3_sat convert_half3
695#define convert_half4_sat convert_half4
696#define convert_half8_sat convert_half8
697#define convert_half16_sat convert_half16
698
Michele Di Giorgioa046e162019-10-08 09:36:26 +0100699#define convert_float1 convert_float
700#define convert_half1 convert_half
701#define convert_char1 convert_char
702#define convert_uchar1 convert_uchar
703#define convert_short1 convert_short
704#define convert_ushort1 convert_ushort
705#define convert_int1 convert_int
706#define convert_uint1 convert_uint
707#define convert_long1 convert_long
708#define convert_ulong1 convert_ulong
709#define convert_double1 convert_double
710
711#define convert_char1_sat convert_char_sat
712#define convert_uchar1_sat convert_uchar_sat
Sheri Zhang4f1650f2021-04-15 12:58:20 +0100713#define convert_uchar2_sat convert_uchar2_sat
714#define convert_uchar3_sat convert_uchar3_sat
715#define convert_uchar4_sat convert_uchar4_sat
716#define convert_uchar8_sat convert_uchar8_sat
717#define convert_uchar16_sat convert_uchar16_sat
Michele Di Giorgioa046e162019-10-08 09:36:26 +0100718#define convert_short1_sat convert_short_sat
719#define convert_ushort1_sat convert_ushort_sat
720#define convert_int1_sat convert_int_sat
721#define convert_uint1_sat convert_uint_sat
722#define convert_long1_sat convert_long_sat
723#define convert_ulong1_sat convert_ulong_sat
724#define convert_double1_sat convert_double_sat
725
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100726#define VEC_DATA_TYPE_STR(type, size) type##size
727#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)
728
729#define CONVERT_STR(x, type) (convert_##type((x)))
730#define CONVERT(x, type) CONVERT_STR(x, type)
731
732#define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x)))
733#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type)
734
735#define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x)))
736#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round)
737
Giorgio Arena2d1a8352020-10-26 15:04:08 +0000738#define select_vec_dt_uchar(size) uchar##size
739#define select_vec_dt_char(size) char##size
740#define select_vec_dt_ushort(size) ushort##size
741#define select_vec_dt_short(size) short##size
742#define select_vec_dt_half(size) short##size
743#define select_vec_dt_uint(size) uint##size
744#define select_vec_dt_int(size) int##size
745#define select_vec_dt_float(size) int##size
746#define select_vec_dt_ulong(size) ulong##size
747#define select_vec_dt_long(size) long##size
Giorgio Arenad056e572020-10-12 11:53:51 +0100748
Giorgio Arena2d1a8352020-10-26 15:04:08 +0000749#define SELECT_VEC_DATA_TYPE_STR(type, size) select_vec_dt_##type(size)
750#define SELECT_VEC_DATA_TYPE(type, size) SELECT_VEC_DATA_TYPE_STR(type, size)
751#define SELECT_DATA_TYPE(type) SELECT_VEC_DATA_TYPE_STR(type, 1)
752
Giorgio Arenae36208c2021-01-21 14:53:56 +0000753#define signed_int_vec_dt_uchar(size) char##size
754#define signed_int_vec_dt_char(size) char##size
755#define signed_int_vec_dt_ushort(size) short##size
756#define signed_int_vec_dt_short(size) short##size
757#define signed_int_vec_dt_half(size) short##size
758#define signed_int_vec_dt_uint(size) int##size
759#define signed_int_vec_dt_int(size) int##size
760#define signed_int_vec_dt_float(size) int##size
761#define signed_int_vec_dt_ulong(size) long##size
762#define signed_int_vec_dt_long(size) long##size
763
764#define SIGNED_INT_VEC_DATA_TYPE_STR(type, size) signed_int_vec_dt_##type(size)
765#define SIGNED_INT_VEC_DATA_TYPE(type, size) SIGNED_INT_VEC_DATA_TYPE_STR(type, size)
766#define SIGNED_INT_DATA_TYPE(type) SIGNED_INT_VEC_DATA_TYPE_STR(type, 1)
767
Giorgio Arena2d1a8352020-10-26 15:04:08 +0000768#define sum_reduce_1(x) (x)
769#define sum_reduce_2(x) ((x).s0) + ((x).s1)
770#define sum_reduce_3(x) sum_reduce_2((x).s01) + ((x).s2)
771#define sum_reduce_4(x) sum_reduce_2((x).s01) + sum_reduce_2((x).s23)
772#define sum_reduce_8(x) sum_reduce_4((x).s0123) + sum_reduce_4((x).s4567)
773#define sum_reduce_16(x) sum_reduce_8((x).s01234567) + sum_reduce_8((x).s89ABCDEF)
774
775#define SUM_REDUCE_STR(x, size) sum_reduce_##size(x)
776#define SUM_REDUCE(x, size) SUM_REDUCE_STR(x, size)
777
Giorgio Arena3ecf9fe2021-04-28 16:11:51 +0100778#define prod_reduce_1(x) (x)
779#define prod_reduce_2(x) ((x).s0) * ((x).s1)
780#define prod_reduce_3(x) prod_reduce_2((x).s01) * ((x).s2)
781#define prod_reduce_4(x) prod_reduce_2((x).s01) * prod_reduce_2((x).s23)
782#define prod_reduce_8(x) prod_reduce_4((x).s0123) * prod_reduce_4((x).s4567)
783#define prod_reduce_16(x) prod_reduce_8((x).s01234567) * prod_reduce_8((x).s89ABCDEF)
784
785#define PROD_REDUCE_STR(x, size) prod_reduce_##size(x)
786#define PROD_REDUCE(x, size) PROD_REDUCE_STR(x, size)
787
Giorgio Arena2d1a8352020-10-26 15:04:08 +0000788#define max_reduce_1(x) (x)
789#define max_reduce_2(x) max(((x).s0), ((x).s1))
790#define max_reduce_3(x) max(max_reduce_2((x).s01), ((x).s2))
791#define max_reduce_4(x) max(max_reduce_2((x).s01), max_reduce_2((x).s23))
792#define max_reduce_8(x) max(max_reduce_4((x).s0123), max_reduce_4((x).s4567))
793#define max_reduce_16(x) max(max_reduce_8((x).s01234567), max_reduce_8((x).s89ABCDEF))
794
795#define MAX_REDUCE_STR(x, size) max_reduce_##size(x)
796#define MAX_REDUCE(x, size) MAX_REDUCE_STR(x, size)
Giorgio Arenad056e572020-10-12 11:53:51 +0100797
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100798#define VECTOR_DECLARATION(name) \
799 __global uchar *name##_ptr, \
800 uint name##_stride_x, \
801 uint name##_step_x, \
802 uint name##_offset_first_element_in_bytes
803
804#define IMAGE_DECLARATION(name) \
805 __global uchar *name##_ptr, \
806 uint name##_stride_x, \
807 uint name##_step_x, \
808 uint name##_stride_y, \
809 uint name##_step_y, \
810 uint name##_offset_first_element_in_bytes
811
812#define TENSOR3D_DECLARATION(name) \
813 __global uchar *name##_ptr, \
814 uint name##_stride_x, \
815 uint name##_step_x, \
816 uint name##_stride_y, \
817 uint name##_step_y, \
818 uint name##_stride_z, \
819 uint name##_step_z, \
820 uint name##_offset_first_element_in_bytes
821
steniu01868e5412017-07-17 23:16:00 +0100822#define TENSOR4D_DECLARATION(name) \
823 __global uchar *name##_ptr, \
824 uint name##_stride_x, \
825 uint name##_step_x, \
826 uint name##_stride_y, \
827 uint name##_step_y, \
828 uint name##_stride_z, \
829 uint name##_step_z, \
830 uint name##_stride_w, \
831 uint name##_step_w, \
832 uint name##_offset_first_element_in_bytes
833
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100834#define CONVERT_TO_VECTOR_STRUCT(name) \
835 update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x)
836
837#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \
838 update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0)
839
840#define CONVERT_TO_IMAGE_STRUCT(name) \
841 update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y)
842
843#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \
844 update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0)
845
steniu01868e5412017-07-17 23:16:00 +0100846#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
847 update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
848
Anthony Barbier7ff47a32017-07-11 16:54:04 +0100849#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \
850 update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z)
851
steniu010d523cc2017-07-13 14:24:23 +0100852#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
853 update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
854
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100855#define CONVERT_TO_TENSOR3D_STRUCT(name) \
856 update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
857 name##_stride_z, name##_step_z)
858
859#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \
860 update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0)
861
steniu01868e5412017-07-17 23:16:00 +0100862#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size) \
863 update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
Michalis Spyrou5237e012018-01-17 09:40:27 +0000864 name##_stride_z, name##_step_z, name##_stride_w, name##_step_w, mod_size)
steniu01868e5412017-07-17 23:16:00 +0100865
866#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \
867 update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, mod_size)
868
Gian Marco Iodice4d81d752020-07-14 15:05:31 +0100869#define CONVERT_TO_TENSOR3D_STRUCT_NO_UPDATE_PTR(name) \
870 tensor3D_ptr_no_update(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
871 name##_stride_z, name##_step_z)
872
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100873/** Structure to hold Vector information */
874typedef struct Vector
875{
876 __global uchar *ptr; /**< Pointer to the starting postion of the buffer */
877 int offset_first_element_in_bytes; /**< The offset of the first element in the source image */
878 int stride_x; /**< Stride of the image in X dimension (in bytes) */
879} Vector;
880
881/** Structure to hold Image information */
882typedef struct Image
883{
884 __global uchar *ptr; /**< Pointer to the starting postion of the buffer */
885 int offset_first_element_in_bytes; /**< The offset of the first element in the source image */
886 int stride_x; /**< Stride of the image in X dimension (in bytes) */
887 int stride_y; /**< Stride of the image in Y dimension (in bytes) */
888} Image;
889
890/** Structure to hold 3D tensor information */
891typedef struct Tensor3D
892{
893 __global uchar *ptr; /**< Pointer to the starting postion of the buffer */
894 int offset_first_element_in_bytes; /**< The offset of the first element in the source image */
895 int stride_x; /**< Stride of the image in X dimension (in bytes) */
896 int stride_y; /**< Stride of the image in Y dimension (in bytes) */
897 int stride_z; /**< Stride of the image in Z dimension (in bytes) */
898} Tensor3D;
899
steniu01868e5412017-07-17 23:16:00 +0100900/** Structure to hold 4D tensor information */
901typedef struct Tensor4D
902{
903 __global uchar *ptr; /**< Pointer to the starting postion of the buffer */
904 int offset_first_element_in_bytes; /**< The offset of the first element in the source image */
905 int stride_x; /**< Stride of the image in X dimension (in bytes) */
906 int stride_y; /**< Stride of the image in Y dimension (in bytes) */
907 int stride_z; /**< Stride of the image in Z dimension (in bytes) */
908 int stride_w; /**< Stride of the image in W dimension (in bytes) */
909} Tensor4D;
910
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100911/** Wrap vector information into an Vector structure, and make the pointer point at this workitem's data.
912 *
913 * @param[in] ptr Pointer to the starting postion of the buffer
914 * @param[in] offset_first_element_in_bytes The offset of the first element in the source vector
915 * @param[in] stride_x Stride of the vector in X dimension (in bytes)
916 * @param[in] step_x stride_x * number of elements along X processed per workitem(in bytes)
917 *
918 * @return An image object
919 */
Georgios Pinitasaf7f7402018-10-25 16:04:40 +0100920inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100921{
922 Vector vector =
923 {
924 .ptr = ptr,
925 .offset_first_element_in_bytes = offset_first_element_in_bytes,
926 .stride_x = stride_x,
927 };
928 vector.ptr += vector.offset_first_element_in_bytes + get_global_id(0) * step_x;
929 return vector;
930}
931
932/** Wrap image information into an Image structure, and make the pointer point at this workitem's data.
933 *
934 * @param[in] ptr Pointer to the starting postion of the buffer
935 * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
936 * @param[in] stride_x Stride of the image in X dimension (in bytes)
937 * @param[in] step_x stride_x * number of elements along X processed per workitem(in bytes)
938 * @param[in] stride_y Stride of the image in Y dimension (in bytes)
939 * @param[in] step_y stride_y * number of elements along Y processed per workitem(in bytes)
940 *
941 * @return An image object
942 */
Georgios Pinitasaf7f7402018-10-25 16:04:40 +0100943inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100944{
945 Image img =
946 {
947 .ptr = ptr,
948 .offset_first_element_in_bytes = offset_first_element_in_bytes,
949 .stride_x = stride_x,
950 .stride_y = stride_y
951 };
952 img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y;
953 return img;
954}
955
Anthony Barbier7ff47a32017-07-11 16:54:04 +0100956/** Wrap 3D tensor information into an image structure, and make the pointer point at this workitem's data.
957 *
958 * @param[in] ptr Pointer to the starting postion of the buffer
959 * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
960 * @param[in] stride_x Stride of the image in X dimension (in bytes)
961 * @param[in] step_x stride_x * number of elements along X processed per workitem(in bytes)
962 * @param[in] stride_y Stride of the image in Y dimension (in bytes)
963 * @param[in] step_y stride_y * number of elements along Y processed per workitem(in bytes)
964 * @param[in] stride_z Stride of the image in Z dimension (in bytes)
965 * @param[in] step_z stride_z * number of elements along Z processed per workitem(in bytes)
966 *
967 * @return A 3D tensor object
968 */
Georgios Pinitasaf7f7402018-10-25 16:04:40 +0100969inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
Anthony Barbier7ff47a32017-07-11 16:54:04 +0100970{
971 Image img =
972 {
973 .ptr = ptr,
974 .offset_first_element_in_bytes = offset_first_element_in_bytes,
975 .stride_x = stride_x,
976 .stride_y = stride_y
977 };
978 img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
979 return img;
980}
981
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100982/** Wrap 3D tensor information into an tensor structure, and make the pointer point at this workitem's data.
983 *
984 * @param[in] ptr Pointer to the starting postion of the buffer
985 * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
986 * @param[in] stride_x Stride of the image in X dimension (in bytes)
987 * @param[in] step_x stride_x * number of elements along X processed per workitem(in bytes)
988 * @param[in] stride_y Stride of the image in Y dimension (in bytes)
989 * @param[in] step_y stride_y * number of elements along Y processed per workitem(in bytes)
990 * @param[in] stride_z Stride of the image in Z dimension (in bytes)
991 * @param[in] step_z stride_z * number of elements along Z processed per workitem(in bytes)
992 *
993 * @return A 3D tensor object
994 */
Georgios Pinitasaf7f7402018-10-25 16:04:40 +0100995inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100996{
997 Tensor3D tensor =
998 {
999 .ptr = ptr,
1000 .offset_first_element_in_bytes = offset_first_element_in_bytes,
1001 .stride_x = stride_x,
1002 .stride_y = stride_y,
1003 .stride_z = stride_z
1004 };
1005 tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
1006 return tensor;
1007}
1008
Gian Marco Iodice4d81d752020-07-14 15:05:31 +01001009/** Wrap 3D tensor information into an tensor structure.
1010 *
1011 * @param[in] ptr Pointer to the starting postion of the buffer
1012 * @param[in] offset_first_element_in_bytes The offset of the first element in the source image
1013 * @param[in] stride_x Stride of the image in X dimension (in bytes)
1014 * @param[in] step_x stride_x * number of elements along X processed per workitem(in bytes)
1015 * @param[in] stride_y Stride of the image in Y dimension (in bytes)
1016 * @param[in] step_y stride_y * number of elements along Y processed per workitem(in bytes)
1017 * @param[in] stride_z Stride of the image in Z dimension (in bytes)
1018 * @param[in] step_z stride_z * number of elements along Z processed per workitem(in bytes)
1019 *
1020 * @return A 3D tensor object
1021 */
1022inline Tensor3D tensor3D_ptr_no_update(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
1023{
1024 Tensor3D tensor =
1025 {
1026 .ptr = ptr,
1027 .offset_first_element_in_bytes = offset_first_element_in_bytes,
1028 .stride_x = stride_x,
1029 .stride_y = stride_y,
1030 .stride_z = stride_z
1031 };
1032 return tensor;
1033}
1034
Georgios Pinitasaf7f7402018-10-25 16:04:40 +01001035inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w,
steniu01868e5412017-07-17 23:16:00 +01001036 uint step_w,
1037 uint mod_size)
1038{
1039 Tensor4D tensor =
1040 {
1041 .ptr = ptr,
1042 .offset_first_element_in_bytes = offset_first_element_in_bytes,
1043 .stride_x = stride_x,
1044 .stride_y = stride_y,
1045 .stride_z = stride_z,
1046 .stride_w = stride_w
1047 };
1048
1049 tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w;
1050 return tensor;
1051}
1052
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001053/** Get the pointer position of a Vector
1054 *
1055 * @param[in] vec Pointer to the starting position of the buffer
1056 * @param[in] x Relative X position
1057 */
Georgios Pinitasaf7f7402018-10-25 16:04:40 +01001058inline __global const uchar *vector_offset(const Vector *vec, int x)
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001059{
1060 return vec->ptr + x * vec->stride_x;
1061}
1062
1063/** Get the pointer position of a Image
1064 *
1065 * @param[in] img Pointer to the starting position of the buffer
1066 * @param[in] x Relative X position
1067 * @param[in] y Relative Y position
1068 */
Georgios Pinitasaf7f7402018-10-25 16:04:40 +01001069inline __global uchar *offset(const Image *img, int x, int y)
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001070{
1071 return img->ptr + x * img->stride_x + y * img->stride_y;
1072}
1073
1074/** Get the pointer position of a Tensor3D
1075 *
Gian Marco Iodice3a623242017-07-25 10:25:53 +01001076 * @param[in] tensor Pointer to the starting position of the buffer
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001077 * @param[in] x Relative X position
1078 * @param[in] y Relative Y position
1079 * @param[in] z Relative Z position
1080 */
Georgios Pinitasaf7f7402018-10-25 16:04:40 +01001081inline __global const uchar *tensor3D_offset(const Tensor3D *tensor, int x, int y, int z)
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001082{
1083 return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z;
1084}
1085
steniu01868e5412017-07-17 23:16:00 +01001086/** Get the pointer position of a Tensor4D
1087 *
1088 * @param[in] tensor Pointer to the starting position of the buffer
1089 * @param[in] x Relative X position
1090 * @param[in] y Relative Y position
1091 * @param[in] z Relative Z position
1092 * @param[in] w Relative W position
1093 */
Georgios Pinitasaf7f7402018-10-25 16:04:40 +01001094inline __global const uchar *tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w)
steniu01868e5412017-07-17 23:16:00 +01001095{
1096 return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + w * tensor->stride_w;
1097}
1098
Gian Marco Iodice4d81d752020-07-14 15:05:31 +01001099/** Get the offset for a given linear index of a Tensor3D
1100 *
1101 * @param[in] tensor Pointer to the starting position of the buffer
1102 * @param[in] width Width of the input tensor
1103 * @param[in] height Height of the input tensor
1104 * @param[in] depth Depth of the input tensor
1105 * @param[in] index Linear index
1106 */
1107inline __global const uchar *tensor3D_index2ptr(const Tensor3D *tensor, uint width, uint height, uint depth, uint index)
1108{
1109 uint num_elements = width * height;
1110
1111 const uint z = index / num_elements;
1112
1113 index %= num_elements;
1114
1115 const uint y = index / width;
1116
1117 index %= width;
1118
1119 const uint x = index;
1120
1121 return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + tensor->offset_first_element_in_bytes;
1122}
1123
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001124#endif // _HELPER_H