blob: 0b211a6d1f43518d8989cecd1861371424ead3c0 [file] [log] [blame]
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001/*
Michele Di Giorgiod9eaf612020-07-08 11:12:57 +01002 * Copyright (c) 2017-2020 Arm Limited.
Anthony Barbier6ff3b192017-09-04 18:44:23 +01003 *
4 * SPDX-License-Identifier: MIT
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to
8 * deal in the Software without restriction, including without limitation the
9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10 * sell copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in all
14 * copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24#include "helpers.h"
25
Pablo Telloeb6c88a2019-02-07 15:53:19 +000026#if defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT)
27#define VEC_FLOAT VEC_DATA_TYPE(float, VEC_SIZE)
28#define VEC_INT VEC_DATA_TYPE(int, VEC_SIZE)
Manuel Bottini8481d832019-12-10 15:28:40 +000029#define VEC_QUANT VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
Pablo Telloeb6c88a2019-02-07 15:53:19 +000030#define CONVERT_RTE(x, type) (convert_##type##_rte((x)))
31#define CONVERT_DOWN(x, type) CONVERT_RTE(x, type)
Manuel Bottini8481d832019-12-10 15:28:40 +000032inline VEC_QUANT requantize(VEC_QUANT input, float in_offset, float out_offset, float in_scale, float out_scale)
Pablo Telloeb6c88a2019-02-07 15:53:19 +000033{
34 const VEC_FLOAT in_f32 = (CONVERT(input, VEC_FLOAT) - (VEC_FLOAT)((float)in_offset)) * (VEC_FLOAT)((float)in_scale);
35 const VEC_FLOAT out_f32 = in_f32 / ((VEC_FLOAT)(float)out_scale) + ((VEC_FLOAT)((float)out_offset));
Manuel Bottini8481d832019-12-10 15:28:40 +000036 const VEC_QUANT res_q8 = CONVERT_SAT(CONVERT_DOWN(out_f32, VEC_INT), VEC_QUANT);
37 return res_q8;
Pablo Telloeb6c88a2019-02-07 15:53:19 +000038}
39#endif /* defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT) */
Michele Di Giorgioe6dbde02018-10-19 15:46:19 +010040
Pablo Telloeb6c88a2019-02-07 15:53:19 +000041#if defined(DATA_TYPE) && defined(VEC_SIZE)
Michele Di Giorgio27400b92018-11-01 13:44:05 +000042#if defined(DEPTH) && defined(ELEMENT_SIZE)
43
44#if defined(INPUT1_WIDTH)
45
46#if ELEMENT_SIZE == 1
47#define COND_DATA_TYPE char
48#elif ELEMENT_SIZE == 2
49#define COND_DATA_TYPE short
50#elif ELEMENT_SIZE == 4
51#define COND_DATA_TYPE int
52#else // ELEMENT_SIZE
53#error "Element size not supported"
54#endif // ELEMENT_SIZE
55
56#if VEC_SIZE == 2
57#define SEQ ((int2)(0, 1))
58#elif VEC_SIZE == 4
59#define SEQ ((int4)(0, 1, 2, 3))
60#elif VEC_SIZE == 8
61#define SEQ ((int8)(0, 1, 2, 3, 4, 5, 6, 7))
62#elif VEC_SIZE == 16
63#define SEQ ((int16)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15))
64#else // VEC_SIZE
65#error "Vector size not supported"
66#endif // VEC_SIZE
Pablo Telloeb6c88a2019-02-07 15:53:19 +000067
Michele Di Giorgio27400b92018-11-01 13:44:05 +000068/** This kernel concatenates two input tensors into the output tensor along the first dimension
69 *
70 * @note The data type has to be passed at compile time using -DDATA_TYPE. i.e. -DDATA_TYPE=float
71 * @note Vector size has to be passed at compile time using -DVEC_SIZE. i.e. -DVEC_SIZE=16
72 * @note The offset for the first spatial dimension has to be passed at compile time using -DWIDTH_OFFSET. i.e. -DWIDTH_OFFSET=128
73 * @note Tensor depth should be given as a preprocessor argument using -DDEPTH=size. e.g. -DDEPTH=16
74 * @note First input tensor width should be given as a preprocessor argument using -DINPUT1_WIDTH=width. e.g. -DINPUT1_WIDTH=8
75 *
Michele Di Giorgiof6f78762020-07-06 11:27:21 +010076 * @param[in] src1_ptr Pointer to the source tensor. Supported data types: All.
Michele Di Giorgio27400b92018-11-01 13:44:05 +000077 * @param[in] src1_stride_x Stride of the source tensor in X dimension (in bytes)
78 * @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
79 * @param[in] src1_stride_y Stride of the source tensor in Y dimension (in bytes)
80 * @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
81 * @param[in] src1_stride_z Stride of the source tensor in Z dimension (in bytes)
82 * @param[in] src1_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
83 * @param[in] src1_stride_w Stride of the first source tensor in Z dimension (in bytes)
84 * @param[in] src1_step_w src_stride_z * number of elements along Z processed per workitem(in bytes)
85 * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source tensor
86 * @param[in] src2_ptr Pointer to the source tensor. Supported data types: same as @p src1_ptr
87 * @param[in] src2_stride_x Stride of the source tensor in X dimension (in bytes)
88 * @param[in] src2_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
89 * @param[in] src2_stride_y Stride of the source tensor in Y dimension (in bytes)
90 * @param[in] src2_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
91 * @param[in] src2_stride_z Stride of the source tensor in Z dimension (in bytes)
92 * @param[in] src2_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
93 * @param[in] src2_stride_w Stride of the first source tensor in Z dimension (in bytes)
94 * @param[in] src2_step_w src_stride_z * number of elements along Z processed per workitem(in bytes)
95 * @param[in] src2_offset_first_element_in_bytes The offset of the first element in the source tensor
96 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src1_ptr
97 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
98 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
99 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
100 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
101 * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
102 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
103 * @param[in] dst_stride_w Stride of the destination tensor in Z dimension (in bytes)
104 * @param[in] dst_step_w output_stride_z * number of elements along Z processed per workitem(in bytes)
105 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
Michele Di Giorgio8e150a12018-12-21 15:20:56 +0000106 * @param[in] src1_pad_right Right paddings of the first input tensor in unit of elements
107 * @param[in] src1_pad_left Left paddings of the second input tensor in unit of elements
Michele Di Giorgio27400b92018-11-01 13:44:05 +0000108 */
109__kernel void concatenate_width_x2(
110 TENSOR4D_DECLARATION(src1),
111 TENSOR4D_DECLARATION(src2),
Michele Di Giorgio8e150a12018-12-21 15:20:56 +0000112 TENSOR4D_DECLARATION(dst),
113 uint src1_pad_right,
114 uint src2_pad_left)
Michele Di Giorgio27400b92018-11-01 13:44:05 +0000115{
116 Tensor4D dst = CONVERT_TO_TENSOR4D_STRUCT(dst, DEPTH);
117
118 // Calculate input indices
119 const int x = get_global_id(0) * (int)VEC_SIZE;
120 const int y = get_global_id(1);
121 const int z = get_global_id(2) % (int)DEPTH;
122 const int w = get_global_id(2) / (int)DEPTH;
Michele Di Giorgio8e150a12018-12-21 15:20:56 +0000123 const int x1 = min(x, (int)INPUT1_WIDTH + (int)src1_pad_right - (int)VEC_SIZE);
124 const int x2 = max(x - (int)INPUT1_WIDTH, -(int)src2_pad_left);
Michele Di Giorgio27400b92018-11-01 13:44:05 +0000125
126 // Calculate inputs and output addresses
127 const __global uchar *in1_ptr = src1_ptr + (int)src1_offset_first_element_in_bytes + x1 * (int)src1_stride_x + y * (int)src1_stride_y + z * (int)src1_stride_z + w * (int)src1_stride_w;
128 const __global uchar *in2_ptr = src2_ptr + (int)src2_offset_first_element_in_bytes + x2 * (int)src2_stride_x + y * (int)src2_stride_y + z * (int)src2_stride_z + w * (int)src2_stride_w;
129
Pablo Telloeb6c88a2019-02-07 15:53:19 +0000130 VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
131 src1_values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in1_ptr);
132 VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
133 src2_values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in2_ptr);
Michele Di Giorgio27400b92018-11-01 13:44:05 +0000134
Georgios Pinitas6631ac22019-04-17 12:12:56 +0100135#if defined(OFFSET_IN1) && defined(OFFSET_IN2) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_IN2) && defined(SCALE_OUT)
Pablo Telloeb6c88a2019-02-07 15:53:19 +0000136 src1_values = requantize(src1_values, OFFSET_IN1, OFFSET_OUT, SCALE_IN1, SCALE_OUT);
137 src2_values = requantize(src2_values, OFFSET_IN2, OFFSET_OUT, SCALE_IN2, SCALE_OUT);
Georgios Pinitas6631ac22019-04-17 12:12:56 +0100138#endif /* defined(OFFSET_IN1) && defined(OFFSET_IN2) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_IN2) && defined(SCALE_OUT) */
Michele Di Giorgio27400b92018-11-01 13:44:05 +0000139 const VEC_DATA_TYPE(int, VEC_SIZE) x_coords = SEQ + (VEC_DATA_TYPE(int, VEC_SIZE))(x);
140 const VEC_DATA_TYPE(COND_DATA_TYPE, VEC_SIZE) cond = CONVERT(x_coords < (VEC_DATA_TYPE(int, VEC_SIZE))(INPUT1_WIDTH), VEC_DATA_TYPE(COND_DATA_TYPE, VEC_SIZE));
141 const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) values = select(src2_values, src1_values, cond);
142
143 VSTORE(VEC_SIZE)
144 (values, 0, (__global DATA_TYPE *)dst.ptr);
145}
146
147#if defined(INPUT2_WIDTH) && defined(INPUT3_WIDTH)
148/** This kernel concatenates four input tensors into the output tensor along the first dimension
149 *
150 * @note The data type has to be passed at compile time using -DDATA_TYPE. i.e. -DDATA_TYPE=float
151 * @note Vector size has to be passed at compile time using -DVEC_SIZE. i.e. -DVEC_SIZE=16
152 * @note The offset for the first spatial dimension has to be passed at compile time using -DWIDTH_OFFSET. i.e. -DWIDTH_OFFSET=128
153 * @note Tensor depth should be given as a preprocessor argument using -DDEPTH=size. e.g. -DDEPTH=16
154 * @note First input tensor width should be given as a preprocessor argument using -DINPUT1_WIDTH=width. e.g. -DINPUT1_WIDTH=8
155 * @note Second input tensor width should be given as a preprocessor argument using -DINPUT2_WIDTH=width. e.g. -DINPUT2_WIDTH=8
156 * @note Third input tensor width should be given as a preprocessor argument using -DINPUT3_WIDTH=width. e.g. -DINPUT3_WIDTH=8
157 *
Michele Di Giorgiof6f78762020-07-06 11:27:21 +0100158 * @param[in] src1_ptr Pointer to the source tensor. Supported data types: All
Michele Di Giorgio27400b92018-11-01 13:44:05 +0000159 * @param[in] src1_stride_x Stride of the source tensor in X dimension (in bytes)
160 * @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
161 * @param[in] src1_stride_y Stride of the source tensor in Y dimension (in bytes)
162 * @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
163 * @param[in] src1_stride_z Stride of the source tensor in Z dimension (in bytes)
164 * @param[in] src1_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
165 * @param[in] src1_stride_w Stride of the first source tensor in Z dimension (in bytes)
166 * @param[in] src1_step_w src_stride_z * number of elements along Z processed per workitem(in bytes)
167 * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source tensor
168 * @param[in] src2_ptr Pointer to the source tensor. Supported data types: same as @p src1_ptr
169 * @param[in] src2_stride_x Stride of the source tensor in X dimension (in bytes)
170 * @param[in] src2_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
171 * @param[in] src2_stride_y Stride of the source tensor in Y dimension (in bytes)
172 * @param[in] src2_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
173 * @param[in] src2_stride_z Stride of the source tensor in Z dimension (in bytes)
174 * @param[in] src2_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
175 * @param[in] src2_stride_w Stride of the first source tensor in Z dimension (in bytes)
176 * @param[in] src2_step_w src_stride_z * number of elements along Z processed per workitem(in bytes)
177 * @param[in] src2_offset_first_element_in_bytes The offset of the first element in the source tensor
178 * @param[in] src3_ptr Pointer to the source tensor. Supported data types: same as @p src1_ptr
179 * @param[in] src3_stride_x Stride of the source tensor in X dimension (in bytes)
180 * @param[in] src3_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
181 * @param[in] src3_stride_y Stride of the source tensor in Y dimension (in bytes)
182 * @param[in] src3_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
183 * @param[in] src3_stride_z Stride of the source tensor in Z dimension (in bytes)
184 * @param[in] src3_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
185 * @param[in] src3_stride_w Stride of the first source tensor in Z dimension (in bytes)
186 * @param[in] src3_step_w src_stride_z * number of elements along Z processed per workitem(in bytes)
187 * @param[in] src3_offset_first_element_in_bytes The offset of the first element in the source tensor
188 * @param[in] src4_ptr Pointer to the source tensor. Supported data types: same as @p src1_ptr
189 * @param[in] src4_stride_x Stride of the source tensor in X dimension (in bytes)
190 * @param[in] src4_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
191 * @param[in] src4_stride_y Stride of the source tensor in Y dimension (in bytes)
192 * @param[in] src4_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
193 * @param[in] src4_stride_z Stride of the source tensor in Z dimension (in bytes)
194 * @param[in] src4_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
195 * @param[in] src4_stride_w Stride of the first source tensor in Z dimension (in bytes)
196 * @param[in] src4_step_w src_stride_z * number of elements along Z processed per workitem(in bytes)
197 * @param[in] src4_offset_first_element_in_bytes The offset of the first element in the source tensor
198 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src1_ptr
199 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
200 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
201 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
202 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
203 * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
204 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
205 * @param[in] dst_stride_w Stride of the destination tensor in Z dimension (in bytes)
206 * @param[in] dst_step_w output_stride_z * number of elements along Z processed per workitem(in bytes)
207 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
Michele Di Giorgio8e150a12018-12-21 15:20:56 +0000208 * @param[in] src1_pad_right Right paddings of the first input tensor in unit of elements
209 * @param[in] src2_pad_left Left paddings of the second input tensor in unit of elements
210 * @param[in] src2_pad_right Right paddings of the second input tensor in unit of elements
211 * @param[in] src3_pad_left Left paddings of the third input tensor in unit of elements
212 * @param[in] src3_pad_right Right paddings of the third input tensor in unit of elements
213 * @param[in] src4_pad_left Left paddings of the fourth input tensor in unit of elements
Michele Di Giorgio27400b92018-11-01 13:44:05 +0000214 */
215__kernel void concatenate_width_x4(
216 TENSOR4D_DECLARATION(src1),
217 TENSOR4D_DECLARATION(src2),
218 TENSOR4D_DECLARATION(src3),
219 TENSOR4D_DECLARATION(src4),
Michele Di Giorgio8e150a12018-12-21 15:20:56 +0000220 TENSOR4D_DECLARATION(dst),
221 uint src1_pad_right,
222 uint src2_pad_left,
223 uint src2_pad_right,
224 uint src3_pad_left,
225 uint src3_pad_right,
226 uint src4_pad_left)
Michele Di Giorgio27400b92018-11-01 13:44:05 +0000227{
228 Tensor4D dst = CONVERT_TO_TENSOR4D_STRUCT(dst, DEPTH);
229
230 // Calculate input indices
231 const int x = get_global_id(0) * (int)VEC_SIZE;
232 const int y = get_global_id(1);
233 const int z = get_global_id(2) % (int)DEPTH;
234 const int w = get_global_id(2) / (int)DEPTH;
235
Michele Di Giorgio8e150a12018-12-21 15:20:56 +0000236 const int x1 = min(x, (int)INPUT1_WIDTH + (int)src1_pad_right - (int)VEC_SIZE);
237 const int x2 = min(max(x - (int)INPUT1_WIDTH, -(int)src2_pad_left), (int)INPUT2_WIDTH + (int)src2_pad_right - (int)VEC_SIZE);
238 const int x3 = min(max(x - (int)INPUT1_WIDTH - (int)INPUT2_WIDTH, -(int)src3_pad_left), (int)INPUT3_WIDTH + (int)src3_pad_right - (int)VEC_SIZE);
239 const int x4 = max(x - (int)INPUT1_WIDTH - (int)INPUT2_WIDTH - (int)INPUT3_WIDTH, -(int)src4_pad_left);
Michele Di Giorgio27400b92018-11-01 13:44:05 +0000240
241 // Calculate inputs and output addresses
242 const __global uchar *in1_ptr = src1_ptr + (int)src1_offset_first_element_in_bytes + x1 * (int)src1_stride_x + y * (int)src1_stride_y + z * (int)src1_stride_z + w * (int)src1_stride_w;
243 const __global uchar *in2_ptr = src2_ptr + (int)src2_offset_first_element_in_bytes + x2 * (int)src2_stride_x + y * (int)src2_stride_y + z * (int)src2_stride_z + w * (int)src2_stride_w;
244 const __global uchar *in3_ptr = src3_ptr + (int)src3_offset_first_element_in_bytes + x3 * (int)src3_stride_x + y * (int)src3_stride_y + z * (int)src3_stride_z + w * (int)src3_stride_w;
245 const __global uchar *in4_ptr = src4_ptr + (int)src4_offset_first_element_in_bytes + x4 * (int)src4_stride_x + y * (int)src4_stride_y + z * (int)src4_stride_z + w * (int)src4_stride_w;
246
Pablo Telloeb6c88a2019-02-07 15:53:19 +0000247 VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
248 src1_values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in1_ptr);
249 VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
250 src2_values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in2_ptr);
251 VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
252 src3_values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in3_ptr);
253 VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
254 src4_values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in4_ptr);
255
256#if defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT) && defined(OFFSET_IN2) && defined(SCALE_IN2) && defined(OFFSET_IN3) && defined(SCALE_IN3) && defined(OFFSET_IN4) && defined(SCALE_IN4)
257 src1_values = requantize(src1_values, OFFSET_IN1, OFFSET_OUT, SCALE_IN1, SCALE_OUT);
258 src2_values = requantize(src2_values, OFFSET_IN2, OFFSET_OUT, SCALE_IN2, SCALE_OUT);
259 src3_values = requantize(src3_values, OFFSET_IN3, OFFSET_OUT, SCALE_IN3, SCALE_OUT);
260 src4_values = requantize(src4_values, OFFSET_IN4, OFFSET_OUT, SCALE_IN4, SCALE_OUT);
261#endif /* defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT) && defined(OFFSET_IN2) && defined(SCALE_IN2) && defined(OFFSET_IN3) && defined(SCALE_IN3) && defined(OFFSET_IN4) && defined(SCALE_IN4) */
Michele Di Giorgio27400b92018-11-01 13:44:05 +0000262
263 const VEC_DATA_TYPE(int, VEC_SIZE) x_coords = SEQ + (VEC_DATA_TYPE(int, VEC_SIZE))(x);
264
265 const VEC_DATA_TYPE(COND_DATA_TYPE, VEC_SIZE) cond_in2 = CONVERT(x_coords < (VEC_DATA_TYPE(int, VEC_SIZE))(INPUT1_WIDTH), VEC_DATA_TYPE(COND_DATA_TYPE, VEC_SIZE));
266 const VEC_DATA_TYPE(COND_DATA_TYPE, VEC_SIZE) cond_in3 = CONVERT(x_coords < (VEC_DATA_TYPE(int, VEC_SIZE))(INPUT1_WIDTH + INPUT2_WIDTH), VEC_DATA_TYPE(COND_DATA_TYPE, VEC_SIZE));
267 const VEC_DATA_TYPE(COND_DATA_TYPE, VEC_SIZE) cond_in4 = CONVERT(x_coords < (VEC_DATA_TYPE(int, VEC_SIZE))(INPUT1_WIDTH + INPUT2_WIDTH + INPUT3_WIDTH), VEC_DATA_TYPE(COND_DATA_TYPE, VEC_SIZE));
268
269 VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
270 values = select(src2_values, src1_values, cond_in2);
271 values = select(src3_values, values, cond_in3);
272 values = select(src4_values, values, cond_in4);
273
274 VSTORE(VEC_SIZE)
275 (values, 0, (__global DATA_TYPE *)dst.ptr);
276}
277#endif /* defined(INPUT2_WIDTH) && defined(INPUT3_WIDTH) */
278#endif /* defined(INPUT1_WIDTH) */
279#endif /* defined(DEPTH) && defined(ELEMENT_SIZE) */
280
Michele Di Giorgioe6dbde02018-10-19 15:46:19 +0100281#if defined(WIDTH_OFFSET) && defined(DEPTH)
Michalis Spyrou55b3d122018-05-09 09:59:23 +0100282/** This kernel concatenates the input tensor into the output tensor along the first dimension
283 *
Gian Marco Iodice1d1f32c2018-08-10 09:34:11 +0100284 * @note The data type has to be passed at compile time using -DDATA_TYPE. i.e. -DDATA_TYPE=float
Michele Di Giorgioe6dbde02018-10-19 15:46:19 +0100285 * @note Vector size has to be passed at compile time using -DVEC_SIZE. i.e. -DVEC_SIZE=16
Gian Marco Iodice1d1f32c2018-08-10 09:34:11 +0100286 * @note The offset for the first spatial dimension has to be passed at compile time using -DWIDTH_OFFSET. i.e. -DWIDTH_OFFSET=128
Michele Di Giorgio27400b92018-11-01 13:44:05 +0000287 * @note Tensor depth should be given as a preprocessor argument using -DDEPTH=size. e.g. -DDEPTH=16
Gian Marco Iodice1d1f32c2018-08-10 09:34:11 +0100288 *
289 * @param[in] src_ptr Pointer to the source tensor. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/F32
Michalis Spyrou55b3d122018-05-09 09:59:23 +0100290 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
291 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
292 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
293 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
294 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
295 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
Michele Di Giorgioe6dbde02018-10-19 15:46:19 +0100296 * @param[in] src_stride_w Stride of the first source tensor in Z dimension (in bytes)
297 * @param[in] src_step_w src_stride_z * number of elements along Z processed per workitem(in bytes)
Michalis Spyrou55b3d122018-05-09 09:59:23 +0100298 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
299 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
300 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
301 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
302 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
303 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
304 * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
305 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
Michele Di Giorgioe6dbde02018-10-19 15:46:19 +0100306 * @param[in] dst_stride_w Stride of the destination tensor in Z dimension (in bytes)
307 * @param[in] dst_step_w output_stride_z * number of elements along Z processed per workitem(in bytes)
Michalis Spyrou55b3d122018-05-09 09:59:23 +0100308 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
Michalis Spyrou55b3d122018-05-09 09:59:23 +0100309 */
Pablo Telloeb6c88a2019-02-07 15:53:19 +0000310
Michalis Spyrou55b3d122018-05-09 09:59:23 +0100311__kernel void concatenate_width(
Michele Di Giorgioe6dbde02018-10-19 15:46:19 +0100312 TENSOR4D_DECLARATION(src),
313 TENSOR4D_DECLARATION(dst))
Michalis Spyrou55b3d122018-05-09 09:59:23 +0100314{
Michele Di Giorgioe6dbde02018-10-19 15:46:19 +0100315 Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(src, DEPTH);
316 Tensor4D dst = CONVERT_TO_TENSOR4D_STRUCT(dst, DEPTH);
Michalis Spyrou55b3d122018-05-09 09:59:23 +0100317
318 VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
319 source_values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)src.ptr);
320
Pablo Telloeb6c88a2019-02-07 15:53:19 +0000321#if defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT)
Manuel Bottini8481d832019-12-10 15:28:40 +0000322 const VEC_QUANT out = requantize(source_values, OFFSET_IN1, OFFSET_OUT, SCALE_IN1, SCALE_OUT);
Pablo Telloeb6c88a2019-02-07 15:53:19 +0000323 VSTORE(VEC_SIZE)
324 (out, 0, (__global DATA_TYPE *)(dst.ptr) + WIDTH_OFFSET);
325#else /* defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT) */
Michalis Spyrou55b3d122018-05-09 09:59:23 +0100326 VSTORE(VEC_SIZE)
Gian Marco Iodice1d1f32c2018-08-10 09:34:11 +0100327 (source_values, 0, (__global DATA_TYPE *)(dst.ptr) + WIDTH_OFFSET);
Pablo Telloeb6c88a2019-02-07 15:53:19 +0000328#endif /* defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT) */
Michalis Spyrou55b3d122018-05-09 09:59:23 +0100329}
Pablo Telloeb6c88a2019-02-07 15:53:19 +0000330
Michele Di Giorgioe6dbde02018-10-19 15:46:19 +0100331#endif /* defined(WIDTH_OFFSET) && defined(DEPTH) */
Michalis Spyrou55b3d122018-05-09 09:59:23 +0100332
Pablo Tello6a14adb2019-03-05 17:33:08 +0000333#if defined(HEIGHT_OFFSET) && defined(DEPTH) && defined(VEC_SIZE)
334/** This kernel concatenates the input tensor into the output tensor along the second dimension
335 *
336 * @note The data type has to be passed at compile time using -DDATA_TYPE. i.e. -DDATA_TYPE=float
337 * @note Vector size has to be passed at compile time using -DVEC_SIZE. i.e. -DVEC_SIZE=16
338 * @note Vector sizes supported are 2,4,8 and 16.
339 * @note The offset for the second spatial dimension has to be passed at compile time using -DHEIGHT_OFFSET. i.e. -DHEIGHT_OFFSET=128
340 * @note Tensor depth should be given as a preprocessor argument using -DDEPTH=size. e.g. -DDEPTH=16
341 *
342 * @param[in] src_ptr Pointer to the source tensor. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/F32
343 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
344 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
345 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
346 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
347 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
348 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
349 * @param[in] src_stride_w Stride of the first source tensor in Z dimension (in bytes)
350 * @param[in] src_step_w src_stride_z * number of elements along Z processed per workitem(in bytes)
351 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
352 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
353 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
354 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
355 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
356 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
357 * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
358 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
359 * @param[in] dst_stride_w Stride of the destination tensor in Z dimension (in bytes)
360 * @param[in] dst_step_w output_stride_z * number of elements along Z processed per workitem(in bytes)
361 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
362 */
363
364__kernel void concatenate_height(
365 TENSOR4D_DECLARATION(src),
366 TENSOR4D_DECLARATION(dst))
367{
368 Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(src, DEPTH);
369 Tensor4D dst = CONVERT_TO_TENSOR4D_STRUCT(dst, DEPTH);
370
371 VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
372 source_values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)src.ptr);
373
374#if defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT)
Manuel Bottini8481d832019-12-10 15:28:40 +0000375 const VEC_QUANT out = requantize(source_values, OFFSET_IN1, OFFSET_OUT, SCALE_IN1, SCALE_OUT);
Pablo Tello6a14adb2019-03-05 17:33:08 +0000376 VSTORE(VEC_SIZE)
377 (out, 0, (__global DATA_TYPE *)(dst.ptr + HEIGHT_OFFSET * dst_stride_y));
378#else /* defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT) */
379 VSTORE(VEC_SIZE)
380 (source_values, 0, (__global DATA_TYPE *)(dst.ptr + HEIGHT_OFFSET * dst_stride_y));
381#endif /* defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT) */
382}
383
384#endif /* defined(HEIGHT_OFFSET) && defined(DEPTH) */
385
Giorgio Arena53048842020-10-07 16:03:43 +0100386#if defined(VEC_SIZE_LEFTOVER)
387
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100388/** This kernel concatenates the input tensor into the output tensor along the third dimension
389 *
Michele Di Giorgioe6dbde02018-10-19 15:46:19 +0100390 * @note The data type has to be passed at compile time using -DDATA_TYPE. i.e. -DDATA_TYPE=float
391 * @note Vector size has to be passed at compile time using -DVEC_SIZE. i.e. -DVEC_SIZE=16
Giorgio Arena53048842020-10-07 16:03:43 +0100392 * @note Leftover vector size has to be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE=3. It is defined as the remainder between the input's first dimension and VEC_SIZE
Michele Di Giorgioe6dbde02018-10-19 15:46:19 +0100393 *
Michele Di Giorgiof6f78762020-07-06 11:27:21 +0100394 * @param[in] src_ptr Pointer to the source tensor. Supported data types: All
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100395 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
396 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
397 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
398 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
Georgios Pinitasac4e8732017-07-05 17:02:25 +0100399 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
400 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100401 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
Georgios Pinitasac4e8732017-07-05 17:02:25 +0100402 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100403 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
404 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
405 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
406 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
Georgios Pinitasac4e8732017-07-05 17:02:25 +0100407 * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
408 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100409 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
Georgios Pinitasac4e8732017-07-05 17:02:25 +0100410 * @param[in] offsets The offsets to the first valid element of the output tensor in bytes
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100411 */
Vidhya Sudhan Loganathan338595b2019-06-28 14:09:53 +0100412__kernel void concatenate(
Georgios Pinitasac4e8732017-07-05 17:02:25 +0100413 TENSOR3D_DECLARATION(src),
414 TENSOR3D_DECLARATION(dst),
Michalis Spyroua9c44722019-04-05 17:18:36 +0100415 int offset)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100416{
Giorgio Arena53048842020-10-07 16:03:43 +0100417 uint x_offs = max((int)(get_global_id(0) * VEC_SIZE * sizeof(DATA_TYPE) - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE * sizeof(DATA_TYPE)), 0);
418
419 __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x_offs + get_global_id(1) * src_stride_y + get_global_id(2) * src_stride_z;
420 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_offs + get_global_id(1) * dst_stride_y + get_global_id(2) * dst_stride_z;
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100421
Georgios Pinitasac4e8732017-07-05 17:02:25 +0100422 VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
Giorgio Arena53048842020-10-07 16:03:43 +0100423 source_values0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)src_addr);
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100424
Pablo Telloeb6c88a2019-02-07 15:53:19 +0000425#if defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT)
Giorgio Arena53048842020-10-07 16:03:43 +0100426 source_values0 = requantize(source_values0, OFFSET_IN1, OFFSET_OUT, SCALE_IN1, SCALE_OUT);
Pablo Telloeb6c88a2019-02-07 15:53:19 +0000427#endif /* defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT) */
428
Giorgio Arena53048842020-10-07 16:03:43 +0100429 STORE_VECTOR_SELECT(source_values, DATA_TYPE, dst_addr + offset, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100430}
Giorgio Arena53048842020-10-07 16:03:43 +0100431#endif /* defined(VEC_SIZE_LEFTOVER) */
Michele Di Giorgioe6dbde02018-10-19 15:46:19 +0100432#endif /* defined(DATA_TYPE) && defined(VEC_SIZE) */